1 /*
  2  * CDDL HEADER START
  3  *
  4  * The contents of this file are subject to the terms of the
  5  * Common Development and Distribution License, Version 1.0 only
  6  * (the "License").  You may not use this file except in compliance
  7  * with the License.
  8  *
  9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 10  * or http://www.opensolaris.org/os/licensing.
 11  * See the License for the specific language governing permissions
 12  * and limitations under the License.
 13  *
 14  * When distributing Covered Code, include this CDDL HEADER in each
 15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 16  * If applicable, add the following below this CDDL HEADER, with the
 17  * fields enclosed by brackets "[]" replaced with your own identifying
 18  * information: Portions Copyright [yyyy] [name of copyright owner]
 19  *
 20  * CDDL HEADER END
 21  */
 22 /*
 23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 24  * Use is subject to license terms.
 25  */
 26 
 27 
 28  /* Copyright (c) 2006, The Ohio State University. All rights reserved.
 29   *
  * Portions of this source code are developed by the team members of
 31   * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 32   * headed by Professor Dhabaleswar K. (DK) Panda.
 33   *
  * Acknowledgements to contributions from developers:
 35   *   Ranjit Noronha: noronha@cse.ohio-state.edu
 36   *   Lei Chai      : chail@cse.ohio-state.edu
 37   *   Weikuan Yu    : yuw@cse.ohio-state.edu
 38   *
 39   */
 40 
 41 #pragma ident   "@(#)rpcib.c    1.29    06/01/25 SMI"
 42 
 43 /*
 44  * The rpcib plugin. Implements the interface for RDMATF's
 45  * interaction with IBTF.
 46  */
 47 
 48 #include <sys/param.h>
 49 #include <sys/types.h>
 50 #include <sys/user.h>
 51 #include <sys/systm.h>
 52 #include <sys/sysmacros.h>
 53 #include <sys/proc.h>
 54 #include <sys/socket.h>
 55 #include <sys/file.h>
 56 #include <sys/stream.h>
 57 #include <sys/strsubr.h>
 58 #include <sys/stropts.h>
 59 #include <sys/errno.h>
 60 #include <sys/kmem.h>
 61 #include <sys/debug.h>
 62 #include <sys/systm.h>
 63 #include <sys/pathname.h>
 64 #include <sys/kstat.h>
 65 #include <sys/t_lock.h>
 66 #include <sys/ddi.h>
 67 #include <sys/cmn_err.h>
 68 #include <sys/time.h>
 69 #include <sys/isa_defs.h>
 70 #include <sys/callb.h>
 71 #include <sys/sunddi.h>
 72 #include <sys/sunndi.h>
 73 
 74 /* #define IB_FMR_SUP */
 75 /* #define CLNT_POLL_CQ */
 76 #include <sys/ib/ibtl/ibti.h>
 77 #include <rpc/rpc.h>
 78 #include <rpc/ib.h>
 79 
 80 #include <sys/modctl.h>
 81 
 82 #include <sys/pathname.h>
 83 #include <sys/kstr.h>
 84 #include <sys/sockio.h>
 85 #include <sys/vnode.h>
 86 #include <sys/tiuser.h>
 87 #include <net/if.h>
 88 #include <sys/cred.h>
 89 #include <rpc/rpc_rdma.h>
 90 
/* Count of client connections opened through this plugin — TODO confirm
 * where it is updated; not visible in this chunk. */
int num_clients = 0;
/* Nonzero once this node acts as an RPC/RDMA server — presumably set by
 * the listen path; verify against rib_listen(). */
volatile uint32_t is_server = 0;

/* The kernel provides inet_ntop() but no header exports it; declare here. */
extern char *inet_ntop(int, const void *, char *, int);
 95 
 96 
 97 /*
 98  * Prototype declarations for driver ops
 99  */
100 
101 static int      rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
102 static int      rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
103                             void *, void **);
104 static int      rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
105 
106 
107 /* rpcib cb_ops */
/*
 * rpcib cb_ops: this pseudo device exposes no real character/block entry
 * points (open/close are nulldev no-ops, everything else nodev/ENXIO);
 * the minor node exists only so the plugin can attach and register with
 * IBTF.  Consumers reach the plugin through the RDMATF op vector instead.
 */
static struct cb_ops rpcib_cbops = {
        nulldev,                /* open */
        nulldev,                /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        nodev,                  /* read */
        nodev,                  /* write */
        nodev,                  /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        nochpoll,               /* poll */
        ddi_prop_op,            /* prop_op */
        NULL,                   /* stream */
        D_MP,                   /* cb_flag: MT-safe driver */
        CB_REV,                 /* rev */
        nodev,                  /* int (*cb_aread)() */
        nodev                   /* int (*cb_awrite)() */
};
128 
129 
130 
131 
/*
 * Device operations vector handed to the kernel via the modldrv below.
 * Only getinfo/attach/detach are implemented; all char/block access goes
 * through the (stub) cb_ops table.
 */
static struct dev_ops rpcib_ops = {
        DEVO_REV,               /* devo_rev, */
        0,                      /* refcnt  */
        rpcib_getinfo,          /* info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        rpcib_attach,           /* attach */
        rpcib_detach,           /* detach */
        nodev,                  /* reset */
        &rpcib_cbops,                   /* driver ops - devctl interfaces */
        NULL,                   /* bus operations */
        NULL                    /* power */
};
148 
/*
 * Module linkage information: a single driver module (rib_modldrv)
 * wrapped in the modlinkage that _init()/_fini()/_info() pass to
 * mod_install()/mod_remove()/mod_info().
 */

static struct modldrv rib_modldrv = {
        &mod_driverops,                         /* Driver module */
        "RPCIB plugin driver, ver 1.29", /* Driver name and version */
        &rpcib_ops,             /* Driver ops */
};

static struct modlinkage rib_modlinkage = {
        MODREV_1,
        (void *)&rib_modldrv,
        NULL                    /* list terminator */
};
164 
#ifdef SERVER_REG_CACHE
/*
 * Node in the server-side registered-buffer cache, kept in an AVL tree
 * (compared by avl_compare(), presumably keyed on `len` — TODO confirm).
 * Each node caches pre-registered long-reply buffers of one size.
 */
typedef struct cache_struct {
avl_node_t         avl_link;    /* linkage in the per-HCA AVL tree */
rib_lrc_entry_t    r;           /* cached registered-buffer entry */
uint32_t           len;         /* buffer length for this node */
uint32_t           elements;    /* count of cached entries — verify */
kmutex_t           node_lock;   /* protects this node's contents */
} cache_avl_struct_t;


#if 1
/* Running total of cache buffers (debug/statistics counter). */
int rib_total_buffers = 0;
#endif
#endif
179 /*
180  * rib_stat: private data pointer used when registering
181  *      with the IBTF.  It is returned to the consumer
182  *      in all callbacks.
183  */
184 static rpcib_state_t *rib_stat = NULL;
185 
186 #define RNR_RETRIES     IBT_RNR_INFINITE_RETRY 
187 #define MAX_PORTS       2
188 
189 #ifdef IB_FMR_SUP 
190 #define IB_FMR_DIRTY_MARK       32 
191 #define IB_FMR_MAX_SIZE         1048576 
192 /*#define IB_FMR_MAX_SIZE         32768 */ 
193 #endif 
194  
195 int preposted_rbufs = RDMA_BUFS_GRANT; 
196 int send_threshold = 1;
197 
198 /*
199  * State of the plugin.
200  * ACCEPT = accepting new connections and requests.
201  * NO_ACCEPT = not accepting new connection and requests.
202  * This should eventually move to rpcib_state_t structure, since this
203  * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
205  * state for one and in no_accept state for the other.
206  */
207 int             plugin_state;
208 kmutex_t        plugin_state_lock;
209 
210 
211 /*
212  * RPCIB RDMATF operations
213  */
#if defined(MEASURE_POOL_DEPTH)
/* Instrumentation stub: probe point reporting the posted receive-buffer
 * count (x); intentionally a no-op — presumably traced externally. */
static void rib_posted_rbufs(uint32_t x) { return;}
#endif
217 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
218 static rdma_stat rib_disconnect(CONN *conn);
219 static void rib_listen(struct rdma_svc_data *rd);
220 static void rib_listen_stop(struct rdma_svc_data *rd);
221 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf, uint_t buflen, 
222         struct mrc *buf_handle);
223 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
224         struct mrc buf_handle);
225 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, 
226                 caddr_t buf, uint_t buflen, struct mrc *buf_handle); 
227 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, 
228          struct mrc buf_handle); 
229 #ifdef SERVER_REG_CACHE 
230 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen, 
231         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc); 
232 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf, 
233         struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *); 
234 #else 
235 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen, 
236         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
237 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
238         struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
239 
240 #endif
241 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
242         caddr_t buf, int len, int cpu);
243 
244 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
245 
246 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
247 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
248 
249 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
250 
251 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
252 #if defined (CLNT_INTERRUPT_COAL)
253 static void rib_scq_free(caddr_t);
254 static rdma_stat rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid);
255 #endif
256 #if defined(ASYNC_SERVER_DEREG)
257 static rdma_stat rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t, caddr_t, int, caddr_t, int, int, int);
258 #endif
259 #if defined(ASYNC_CLIENT_DEREG)
260 static void insert_queue(CONN  *conn, struct clist  *rwc);
261 #endif
262 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
263 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
264 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
265 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
266 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
267 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
268 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
269 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
270 static rdma_stat rib_conn_release(CONN *conn);
271 static rdma_stat rib_getinfo(rdma_info_t *info);
272 #ifdef DYNAMIC_CREDIT_CONTROL
273 void rib_get_resource_info(CONN *, int *, int *);
274 #endif
275 
276 #ifdef SERVER_REG_CACHE
277 static rib_lrc_entry_t *rib_get_server_cache_buf(CONN *conn, uint32_t len);
278 static void rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
279 static void rib_destroy_cache(rib_hca_t *hca);
280 static void
281             rib_server_side_cache_reclaim(void *argp);
282 static int avl_compare(const void *t1,const void *t2);
283 #endif
284 
285 static rdma_stat rib_register_ats(rib_hca_t *);
286 static void rib_deregister_ats();
287 static void rib_stop_services(rib_hca_t *);
288 
289 /*
290  * RPCIB addressing operations
291  */
292 char ** get_ip_addrs(int *count);
293 int get_interfaces(TIUSER *tiptr, int *num);
294 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
295 int get_ibd_ipaddr(rpcib_ibd_insts_t *);
296 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
297 void rib_get_ibd_insts(rpcib_ibd_insts_t *);
298 #if defined(ASYNC_SERVER_DEREG)||defined(ASYNC_CLIENT_DEREG)
299 static int clist_deregister1(CONN *, struct clist *, bool_t );
300 #endif
301 
#if defined(ASYNC_CLIENT_DEREG)
/*
 * Work element for asynchronous client-side memory deregistration.
 * Elements are queued on `rqueue` (doubly linked via forw/back) and
 * presumably drained by async_dereg_thread() — body not in this chunk.
 */
typedef struct async_dereg {
   struct async_dereg  *forw;           /* next queue element */
   struct async_dereg  *back;           /* previous queue element */
               CONN    c_conn;          /* connection owning the clist */
               struct clist c_clist;    /* chunk list to deregister */
} ASYNC;
static void async_dereg_thread(caddr_t arg);
extern pri_t            minclsyspri;            /* priority for taskq */
static ASYNC   rqueue;          /* head of the pending-dereg queue */
static kmutex_t        at_mutex;        /* protects rqueue */
static kcondvar_t      at_cond;         /* signaled when work is queued */
#endif
315 /*
316  * RDMA operations the RPCIB module exports
317  */
/*
 * RDMA op vector the RPCIB module exports to RDMATF.
 * NOTE: member order must match the rdmaops_t definition in
 * <rpc/rpc_rdma.h>, including the conditionally compiled slots — the
 * same feature #defines must be in effect in both places, or the
 * vector silently misaligns.
 */
static rdmaops_t rib_ops = {
        rib_reachable,
        rib_conn_get,
        rib_conn_release,
        rib_listen,
        rib_listen_stop,
        rib_registermem,
        rib_deregistermem,
        rib_registermemsync,
        rib_deregistermemsync,
        rib_syncmem,
        rib_reg_buf_alloc,
        rib_reg_buf_free,
        rib_send,
#if defined (CLNT_INTERRUPT_COAL)
        rib_send_bl,
#endif
#if defined(ASYNC_SERVER_DEREG)
        rib_send_nw,
#endif
        rib_send_resp,
        rib_post_resp,
        rib_post_recv,
        rib_recv,
        rib_read,
        rib_write,
        rib_getinfo,
#ifdef SERVER_REG_CACHE
        rib_get_server_cache_buf,
        rib_free_server_cache_buf,
#endif
#ifdef DYNAMIC_CREDIT_CONTROL
        rib_get_resource_info,
#endif
#if defined(ASYNC_CLIENT_DEREG)
        insert_queue,
#endif
};
356 
357 /*
358  * RDMATF RPCIB plugin details
359  */
static rdma_mod_t rib_mod = {
        "ibtf",         /* api name */
        RDMATF_VERS_1,  /* RDMATF interface version */
        0,              /* rdma_count: #HCAs, filled in by rpcib_attach() */
        &rib_ops,   /* rdma op vector for ibtf */
};
366 
367 static rdma_stat open_hcas(rpcib_state_t *);
368 static rdma_stat rib_qp_init(rib_qp_t *, int);
369 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
370 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
371 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
372 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
373 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
374 #ifdef IB_FMR_SUP 
375 static rdma_stat rib_reg_mem_fmr(rib_hca_t *,   caddr_t adsp,caddr_t, uint_t, ibt_mr_flags_t, 
376         ibt_mr_hdl_t *, ibt_ma_hdl_t *, ibt_pmr_desc_t *); 
377 #endif 
378 static rdma_stat rib_reg_mem(rib_hca_t *,  caddr_t adsp, caddr_t, uint_t, ibt_mr_flags_t, 
379         ibt_mr_hdl_t *, ibt_mr_desc_t *);
380 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
381         ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
382 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
383 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
384         rib_qp_t **);
385 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
386         rib_qp_t **);
387 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
388 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
389 static int rib_free_sendwait(struct send_wid *);
390 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
391 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
392 static void rdma_done_rem_list(rib_qp_t *);
393 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
394 
395 static void rib_async_handler(void *,
396         ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
397 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
398 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
399 static int rib_free_svc_recv(struct svc_recv *);
400 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
401 static void rib_free_wid(struct recv_wid *);
402 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
403 static void rib_detach_hca(rib_hca_t *);
404 static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
405         ibt_path_info_t *);
406 
407 /*
408  * Registration with IBTF as a consumer
409  */
/*
 * IBTF client registration info passed to ibt_attach(): IBTI v2,
 * generic client class, async events routed to rib_async_handler().
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
        IBTI_V2,
        IBT_GENERIC,
        rib_async_handler,      /* async event handler */
        NULL,                   /* Memory Region Handler */
        "nfs/ib"                /* client name */
};
417 
418 /*
 * Global structure
420  */
421 
/*
 * Per-driver global state: the single attached devinfo node and the
 * mutex that guards it.  Only one rpcib instance is supported.
 */
typedef struct rpcib_s {
        dev_info_t      *rpcib_dip;     /* set in attach, cleared in detach */
        kmutex_t        rpcib_mutex;    /* protects rpcib_dip */
} rpcib_t;

rpcib_t rpcib;
428 
429 /*
430  * /etc/system controlled variable to control
431  * debugging in rpcib kernel module.
 * Set it to values greater than 1 to control
433  * the amount of debugging messages required.
434  */
435 int rib_debug = 0;
436 #if defined(CLNT_POLL_CQ) 
437 int max_poll_count = 500;  
438 #endif 
439 static int ats_running = 0;
440 
441 
442 int
443 _init(void)
444 {
445         int             error;
446 
447         error = mod_install((struct modlinkage *)&rib_modlinkage);
448         if (error != 0) {
449                 /*
450                  * Could not load module
451                  */
452                 return (error);
453         }
454         mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
455 
456         return (0);
457 }
458 
459 int
460 _fini()
461 {
462         int status;
463 
464         if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
465                 return (EBUSY);
466         }
467 
468         rib_deregister_ats();
469 
470         /*
471          * Remove module
472          */
473         if ((status = mod_remove(&rib_modlinkage)) != 0) {
474                 (void) rdma_register_mod(&rib_mod);
475                 return (status);
476         }
477         mutex_destroy(&plugin_state_lock);
478         return (0);
479 }
480 
/*
 * _info: report module name/version via the modlinkage.
 */
int
_info(struct modinfo *modinfop)
{
        return (mod_info(&rib_modlinkage, modinfop));
}
486 
487 
488 /*
489  * rpcib_getinfo()
490  * Given the device number, return the devinfo pointer or the
491  * instance number.
492  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
493  */
494 
495 /*ARGSUSED*/
496 static int
497 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
498 {
499         int ret = DDI_SUCCESS;
500 
501         switch (cmd) {
502         case DDI_INFO_DEVT2DEVINFO:
503                 if (rpcib.rpcib_dip != NULL)
504                         *result = rpcib.rpcib_dip;
505                 else {
506                         *result = NULL;
507                         ret = DDI_FAILURE;
508                 }
509                 break;
510 
511         case DDI_INFO_DEVT2INSTANCE:
512                 *result = NULL;
513                 break;
514 
515         default:
516                 ret = DDI_FAILURE;
517         }
518         return (ret);
519 }
520 
521 static int
522 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
523 {
524         ibt_status_t    ibt_status;
525         rdma_stat       r_status;
526 
527         switch (cmd) {
528         case DDI_ATTACH:
529                 break;
530         case DDI_RESUME:
531                 return (DDI_SUCCESS);
532         default:
533                 return (DDI_FAILURE);
534         }
535 
536         mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
537 
538         mutex_enter(&rpcib.rpcib_mutex);
539         if (rpcib.rpcib_dip != NULL) {
540                 mutex_exit(&rpcib.rpcib_mutex);
541                 return (DDI_FAILURE);
542         }
543         rpcib.rpcib_dip = dip;
544         mutex_exit(&rpcib.rpcib_mutex);
545         /*
546          * Create the "rpcib" minor-node.
547          */
548         if (ddi_create_minor_node(dip,
549             "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
550                 /* Error message, no cmn_err as they print on console */
551                 return (DDI_FAILURE);
552         }
553 
554         if (rib_stat == NULL) {
555                 rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
556                 mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
557         }
558 
559         rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
560         if (rib_stat->hca_count < 1) {
561                 mutex_destroy(&rib_stat->open_hca_lock);
562                 kmem_free(rib_stat, sizeof (*rib_stat));
563                 rib_stat = NULL;
564                 return (DDI_FAILURE);
565         }
566 
567         ibt_status = ibt_attach(&rib_modinfo, dip,
568                         (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
569         if (ibt_status != IBT_SUCCESS) {
570                 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
571                 mutex_destroy(&rib_stat->open_hca_lock);
572                 kmem_free(rib_stat, sizeof (*rib_stat));
573                 rib_stat = NULL;
574                 return (DDI_FAILURE);
575         }
576 
577         mutex_enter(&rib_stat->open_hca_lock);
578         if (open_hcas(rib_stat) != RDMA_SUCCESS) {
579                 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
580                 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
581                 mutex_exit(&rib_stat->open_hca_lock);
582                 mutex_destroy(&rib_stat->open_hca_lock);
583                 kmem_free(rib_stat, sizeof (*rib_stat));
584                 rib_stat = NULL;
585                 return (DDI_FAILURE);
586         }
587         mutex_exit(&rib_stat->open_hca_lock);
588 
589         /*
590          * Register with rdmatf
591          */
592         rib_mod.rdma_count = rib_stat->hca_count;
593         r_status = rdma_register_mod(&rib_mod);
594         if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
595                 rib_detach_hca(rib_stat->hca);
596                 ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
597                 (void) ibt_detach(rib_stat->ibt_clnt_hdl);
598                 mutex_destroy(&rib_stat->open_hca_lock);
599                 kmem_free(rib_stat, sizeof (*rib_stat));
600                 rib_stat = NULL;
601                 return (DDI_FAILURE);
602         }
603 
604 
605         return (DDI_SUCCESS);
606 }
607 
608 /*ARGSUSED*/
609 static int
610 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
611 {
612         switch (cmd) {
613 
614         case DDI_DETACH:
615                 break;
616 
617         case DDI_SUSPEND:
618         default:
619                 return (DDI_FAILURE);
620         }
621 
622         /*
623          * Detach the hca and free resources
624          */
625         mutex_enter(&plugin_state_lock);
626         plugin_state = NO_ACCEPT;
627         mutex_exit(&plugin_state_lock);
628         rib_detach_hca(rib_stat->hca);
629         ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
630         (void) ibt_detach(rib_stat->ibt_clnt_hdl);
631 
632         mutex_enter(&rpcib.rpcib_mutex);
633         rpcib.rpcib_dip = NULL;
634         mutex_exit(&rpcib.rpcib_mutex);
635 
636         mutex_destroy(&rpcib.rpcib_mutex);
637         return (DDI_SUCCESS);
638 }
639 
640 
641 static void
642 rib_deregister_ats()
643 {
644         rib_hca_t               *hca;
645         rib_service_t           *srv_list, *to_remove;
646         ibt_status_t            ibt_status;
647 
648         /*
649          * deregister the Address Translation Service.
650          */
651         hca = rib_stat->hca;
652         rw_enter(&hca->service_list_lock, RW_WRITER);
653         srv_list = hca->ats_list;
654         while (srv_list != NULL) {
655                 to_remove = srv_list;
656                 srv_list = to_remove->srv_next;
657 
658                 ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
659                                 &to_remove->srv_ar);
660                 if (ibt_status != IBT_SUCCESS) {
661 #ifdef DEBUG
662                     if (rib_debug) {
663                         cmn_err(CE_WARN, "_fini: "
664                             "ibt_deregister_ar FAILED"
665                                 " status: %d", ibt_status);
666                     }
667 #endif
668                 } else {
669                     mutex_enter(&rib_stat->open_hca_lock);
670                     ats_running = 0;
671                     mutex_exit(&rib_stat->open_hca_lock);
672 #ifdef DEBUG
673                     if (rib_debug) {
674 
675                         cmn_err(CE_NOTE, "_fini: "
676                             "Successfully unregistered"
677                             " ATS service: %s",
678                             to_remove->srv_name);
679                     }
680 #endif
681                 }
682                 kmem_free(to_remove, sizeof (rib_service_t));
683         }
684         hca->ats_list = NULL;
685         rw_exit(&hca->service_list_lock);
686 }
687 
688 static void rib_rbufpool_free(rib_hca_t *, int);
689 static void rib_rbufpool_deregister(rib_hca_t *, int);
690 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
691 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
692 static rdma_stat rib_rem_replylist(rib_qp_t *);
693 static int rib_remreply(rib_qp_t *, struct reply *);
694 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
695 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
696 
697 
698 /*
699  * One CQ pair per HCA
700  */
701 static rdma_stat
702 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
703         rib_cq_t **cqp, rpcib_state_t *ribstat)
704 {
705         rib_cq_t        *cq;
706         ibt_cq_attr_t   cq_attr;
707         uint32_t        real_size;
708         ibt_status_t    status;
709         rdma_stat       error = RDMA_SUCCESS;
710 
711         cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
712         cq->rib_hca = hca;
713         cq_attr.cq_size = cq_size;
714         cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
715         status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
716             &real_size);
717         if (status != IBT_SUCCESS) {
718                 cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
719                                 " status=%d", status);
720                 error = RDMA_FAILED;
721                 goto fail;
722         }
723         ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
724 
725         /*
726          * Enable CQ callbacks. CQ Callbacks are single shot
727          * (e.g. you have to call ibt_enable_cq_notify()
728          * after each callback to get another one).
729          */
730         status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
731         if (status != IBT_SUCCESS) {
732                 cmn_err(CE_WARN, "rib_create_cq: "
733                         "enable_cq_notify failed, status %d", status);
734                 error = RDMA_FAILED;
735                 goto fail;
736         }
737         *cqp = cq;
738 
739         return (error);
740 fail:
741         if (cq->rib_cq_hdl)
742                 (void) ibt_free_cq(cq->rib_cq_hdl);
743         if (cq)
744                 kmem_free(cq, sizeof (rib_cq_t));
745         return (error);
746 }
747 
748 static rdma_stat
749 open_hcas(rpcib_state_t *ribstat)
750 {
751         rib_hca_t               *hca;
752         ibt_status_t            ibt_status;
753         rdma_stat               status;
754         ibt_hca_portinfo_t      *pinfop;
755         ibt_pd_flags_t          pd_flags = IBT_PD_NO_FLAGS;
756         uint_t                  size, cq_size;
757         int                     i;
758 #ifdef  IB_FMR_SUP 
759         ibt_fmr_pool_attr_t     fmr_attr; 
760         uint_t                  h_page_sz; 
761 #endif 
762         ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
763         if (ribstat->hcas == NULL)
764                 ribstat->hcas = kmem_zalloc(ribstat->hca_count *
765                                     sizeof (rib_hca_t), KM_SLEEP);
766 
767         /*
768          * Open a hca and setup for RDMA
769          */
770         for (i = 0; i < ribstat->hca_count; i++) {
771                 ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
772                                 ribstat->hca_guids[i],
773                                 &ribstat->hcas[i].hca_hdl);
774                 if (ibt_status != IBT_SUCCESS) {
775                         cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
776                                 "returned %d", i, ibt_status);
777                         continue;
778                 }
779                 ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
780                 hca = &(ribstat->hcas[i]);
781                 hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
782                 hca->state = HCA_INITED;
783 
784                 /*
785                  * query HCA info
786                  */
787                 ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
788                 if (ibt_status != IBT_SUCCESS) {
789                         cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
790                             "returned %d (hca_guid 0x%llx)",
791                             ibt_status, (longlong_t)ribstat->hca_guids[i]);
792                         goto fail1;
793                 }
794 
795                 /*
796                  * One PD (Protection Domain) per HCA.
797                  * A qp is allowed to access a memory region
798                  * only when it's in the same PD as that of
799                  * the memory region.
800                  */
801                 ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
802                 if (ibt_status != IBT_SUCCESS) {
803                         cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
804                                 "returned %d (hca_guid 0x%llx)",
805                                 ibt_status, (longlong_t)ribstat->hca_guids[i]);
806                         goto fail1;
807                 }
808 
809                 /*
810                  * query HCA ports
811                  */
812                 ibt_status = ibt_query_hca_ports(hca->hca_hdl,
813                                 0, &pinfop, &hca->hca_nports, &size);
814                 if (ibt_status != IBT_SUCCESS) {
815                         cmn_err(CE_WARN, "open_hcas: "
816                                 "ibt_query_hca_ports returned %d "
817                                 "(hca_guid 0x%llx)",
818                                 ibt_status, (longlong_t)hca->hca_guid);
819                         goto fail2;
820                 }
821                 hca->hca_ports = pinfop;
822                 hca->hca_pinfosz = size;
823                 pinfop = NULL;
824 
825                 cq_size = DEF_CQ_SIZE; /* default cq size */
826                 /*
827                  * Create 2 pairs of cq's (1 pair for client
828                  * and the other pair for server) on this hca.
829                  * If number of qp's gets too large, then several
830                  * cq's will be needed.
831                  */
832                 status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
833                                 &hca->svc_rcq, ribstat);
834                 if (status != RDMA_SUCCESS) {
835                         goto fail3;
836                 }
837 
838                 status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
839                                 &hca->svc_scq, ribstat);
840                 if (status != RDMA_SUCCESS) {
841                         goto fail3;
842                 }
843 
844                 status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
845                                 &hca->clnt_rcq, ribstat);
846                 if (status != RDMA_SUCCESS) {
847                         goto fail3;
848                 }
849 
850                 status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
851                                 &hca->clnt_scq, ribstat);
852                 if (status != RDMA_SUCCESS) {
853                         goto fail3;
854                 }
855 
856                 /*
857                  * Create buffer pools.
858                  * Note rib_rbuf_create also allocates memory windows.
859                  */
860                 hca->recv_pool = rib_rbufpool_create(hca,
861                                         RECV_BUFFER, MAX_BUFS);
862                 if (hca->recv_pool == NULL) {
863                         cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
864                         goto fail3;
865                 }
866 
867                 hca->send_pool = rib_rbufpool_create(hca,
868                                         SEND_BUFFER, MAX_BUFS);
869                 if (hca->send_pool == NULL) {
870                         cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
871                         rib_rbufpool_destroy(hca, RECV_BUFFER);
872                         goto fail3;
873                 }
874 #ifdef  IB_FMR_SUP
875                 /* Global FMR POOL */
876                 bzero(&fmr_attr, sizeof (ibt_fmr_pool_attr_t));
877 
878                 h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
879 
880                 fmr_attr.fmr_max_pages_per_fmr =
881                     (IB_FMR_MAX_SIZE / h_page_sz) + 2;
882                 fmr_attr.fmr_pool_size = MAX_BUFS * 2;
883                 fmr_attr.fmr_dirty_watermark = IB_FMR_DIRTY_MARK;
884                 fmr_attr.fmr_page_sz = h_page_sz;
885                 fmr_attr.fmr_cache = B_FALSE;
886                 fmr_attr.fmr_flags = IBT_MR_SLEEP |
887                     IBT_MR_ENABLE_LOCAL_WRITE |
888                     IBT_MR_ENABLE_REMOTE_READ |
889                     IBT_MR_ENABLE_REMOTE_WRITE;
890                 fmr_attr.fmr_func_hdlr = NULL;
891 
892                 if (rib_debug > 1) {
893                         cmn_err(CE_NOTE, "open_hcas: ibt_create_fmr_pool:");
894                         cmn_err(CE_NOTE, "fmr_page_sz %d, fmr_pool_sz %d, "
895                             "max_pages_per_fmr %d", fmr_attr.fmr_page_sz,
896                             fmr_attr.fmr_pool_size,
897                             fmr_attr.fmr_max_pages_per_fmr);
898                 }
899 
900                 ibt_status = ibt_create_fmr_pool(hca->hca_hdl, hca->pd_hdl,
901                     &fmr_attr, &hca->fmr_pool);
902                 if (ibt_status != IBT_SUCCESS) {
903                         cmn_err(CE_WARN, "open_hcas: Global FMR pool creation "
904                             "failed: %d\n", ibt_status);
905                         rib_rbufpool_destroy(hca, RECV_BUFFER);
906                         rib_rbufpool_destroy(hca, SEND_BUFFER);
907                         goto fail3;
908                 }
909 #endif
910 #ifdef SERVER_REG_CACHE
911                 cmn_err(CE_NOTE,"Registration Cache enabled\n");
912                 { 
913                 cache_avl_struct_t my_avl_node;
914                 hca->server_side_cache =
915                      kmem_cache_create("rib_server_side_cache",
916                      sizeof (cache_avl_struct_t), 0, 
917                      NULL,
918                      NULL, 
919                      rib_server_side_cache_reclaim,                     
920                      hca, NULL, 0);
921                  avl_create(&hca->avl_tree,
922                             avl_compare,
923                             sizeof(cache_avl_struct_t),
924                             (uint_t)&my_avl_node.avl_link-(uint_t)&my_avl_node);
925                 /* mutex_init(&hca->avl_lock, NULL, MUTEX_DEFAULT, NULL);*/
926                 rw_init(&hca->avl_rw_lock, NULL, RW_DRIVER, hca->iblock);
927                 hca->avl_init = TRUE;
928                 
929                 }
930 #endif
931 
932 #if defined(ASYNC_CLIENT_DEREG)
933         rqueue.forw = rqueue.back = &rqueue;
934         mutex_init(&at_mutex, NULL, MUTEX_DEFAULT, NULL);
935         cv_init(&at_cond, NULL, CV_DEFAULT, NULL);
936         (void) thread_create(NULL, 0, async_dereg_thread, NULL, 0, &p0,
937                     TS_RUN, minclsyspri);
938 #endif
939                 /*
940                  * Initialize the registered service list and
941                  * the lock
942                  */
943                 hca->service_list = NULL;
944                 rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
945 
946                 mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
947                 cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
948                 rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
949                         hca->iblock);
950                 rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
951                         hca->iblock);
952                 rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
953                 mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
954                 hca->inuse = TRUE;
955                 /*
956                  * XXX One hca only. Add multi-hca functionality if needed
957                  * later.
958                  */
959                 ribstat->hca = hca;
960                 ribstat->nhca_inited++;
961                 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
962                 break;
963 
964 fail3:
965                 ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
966 fail2:
967                 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
968 fail1:
969                 (void) ibt_close_hca(hca->hca_hdl);
970 
971         }
972         if (ribstat->hca != NULL)
973                 return (RDMA_SUCCESS);
974         else
975                 return (RDMA_FAILED);
976 }
977 
978 /*
979  * Callback routines
980  */
981 
982 /*
983  * SCQ handlers
984  */
985 /* ARGSUSED */
986 static void
987 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
988 {
989         ibt_status_t    ibt_status;
990         ibt_wc_t        wc;
991         int             i;
992 
993         /*
994          * Re-enable cq notify here to avoid missing any
995          * completion queue notification.
996          */
997         (void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
998 
999         ibt_status = IBT_SUCCESS;
1000         while (ibt_status != IBT_CQ_EMPTY) {
1001             bzero(&wc, sizeof (wc));
1002             ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1003             if (ibt_status != IBT_SUCCESS)
1004                 return;
1005 
1006         /*
1007          * Got a send completion
1008          */
1009             if (wc.wc_id != NULL) {     /* XXX can it be otherwise ???? */
1010                 struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1011                 CONN    *conn = qptoc(wd->qp);
1012 
1013                 mutex_enter(&wd->sendwait_lock);
1014                 switch (wc.wc_status) {
1015                 case IBT_WC_SUCCESS:
1016                         wd->status = RDMA_SUCCESS;
1017                         break;
1018                 case IBT_WC_WR_FLUSHED_ERR:
1019                         wd->status = RDMA_FAILED;
1020                         break;
1021                 default:
1022 /*
1023  *    RC Send Q Error Code              Local state     Remote State
1024  *    ====================              ===========     ============
1025  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1026  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1027  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1028  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1029  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1030  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1031  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1032  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1033  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1034  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1035  *    IBT_WC_WR_FLUSHED_ERR               None            None
1036  */
1037 #ifdef DEBUG
1038         if (rib_debug > 1) {
1039             if (wc.wc_status != IBT_WC_SUCCESS) {
1040                     cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
1041                         "WR completed in error, wc.wc_status:%d, "
1042                         "wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
1043             }
1044         }
1045 #endif
1046                         /*
1047                          * Channel in error state. Set connection to
1048                          * ERROR and cleanup will happen either from
1049                          * conn_release  or from rib_conn_get
1050                          */
1051                         wd->status = RDMA_FAILED;
1052                         mutex_enter(&conn->c_lock);
1053                         if (conn->c_state != C_DISCONN_PEND)
1054                                 conn->c_state = C_ERROR;
1055                         mutex_exit(&conn->c_lock);
1056                         break;
1057                 }
1058                 if (wd->cv_sig == 1) {
1059                         /*
1060                          * Notify poster
1061                          */
1062                         cv_signal(&wd->wait_cv);
1063                         mutex_exit(&wd->sendwait_lock);
1064                 } else {
1065                         /*
1066                          * Poster not waiting for notification.
1067                          * Free the send buffers and send_wid
1068                          */
1069                         for (i = 0; i < wd->nsbufs; i++) {
1070                                 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1071                                         (void *)(uintptr_t)wd->sbufaddr[i]);
1072                         }
1073                         mutex_exit(&wd->sendwait_lock);
1074                         (void) rib_free_sendwait(wd);
1075                 }
1076             }
1077         }
1078 }
1079 
1080 #if defined (CLNT_INTERRUPT_COAL)
1081 static void
1082 rib_scq_free(caddr_t widd)
1083 {
1084         struct send_wid *wd = (struct send_wid *)widd;
1085         ibt_status_t    ibt_status;
1086         ibt_wc_t        wc;
1087         int             i;
1088         CONN    *conn = qptoc(wd->qp);
1089 
1090                 wc.wc_status = RDMA_SUCCESS;
1091                 mutex_enter(&wd->sendwait_lock);
1092                 switch (wc.wc_status) {
1093                 case IBT_WC_SUCCESS:
1094                         wd->status = RDMA_SUCCESS;
1095                         break;
1096                 case IBT_WC_WR_FLUSHED_ERR:
1097                         wd->status = RDMA_FAILED;
1098                         break;
1099                 default:
1100 /*
1101  *    RC Send Q Error Code              Local state     Remote State
1102  *    ====================              ===========     ============
1103  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1104  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1105  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1106  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1107  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1108  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1109  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1110  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1111  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1112  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1113  *    IBT_WC_WR_FLUSHED_ERR               None            None
1114  */
1115 #ifdef DEBUG
1116         if (rib_debug > 1) {
1117             if (wc.wc_status != IBT_WC_SUCCESS) {
1118                     cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
1119                         "WR completed in error, wc.wc_status:%d, "
1120                         "wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
1121             }
1122         }
1123 #endif
1124                         /*
1125                          * Channel in error state. Set connection to
1126                          * ERROR and cleanup will happen either from
1127                          * conn_release  or from rib_conn_get
1128                          */
1129                         wd->status = RDMA_FAILED;
1130                         mutex_enter(&conn->c_lock);
1131                         if (conn->c_state != C_DISCONN_PEND)
1132                                 conn->c_state = C_ERROR;
1133                         mutex_exit(&conn->c_lock);
1134                         break;
1135                 }
1136                 if (wd->cv_sig == 1) {
1137                         /*
1138                          * Notify poster
1139                          */
1140                         cmn_err(CE_NOTE,"Some error \n");
1141                         cv_signal(&wd->wait_cv);
1142                         mutex_exit(&wd->sendwait_lock);
1143                 } else {
1144                         /*
1145                          * Poster not waiting for notification.
1146                          * Free the send buffers and send_wid
1147                          */
1148                         for (i = 0; i < wd->nsbufs; i++) {
1149                                 rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
1150                                         (void *)(uintptr_t)wd->sbufaddr[i]);
1151                         }
1152                         mutex_exit(&wd->sendwait_lock);
1153                         (void) rib_free_sendwait(wd);
1154                 }
1155 }
1156 #endif
1157 
/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	int		i;

	/*
	 * Server-side send completion queue handler: drain the CQ and
	 * retire each completed send work request.
	 *
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
	    bzero(&wc, sizeof (wc));
	    ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
	    if (ibt_status != IBT_SUCCESS)
		return;

	/*
	 * Got a send completion
	 */
#ifdef DEBUG
	    if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
		cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
			"wc.wc_status:%d, wc_id:%llX",
			wc.wc_status, (longlong_t)wc.wc_id);
	    }
#endif
	    if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
#ifdef ASYNC_SERVER_DEREG
		/*
		 * Deferred deregistration: now that the send completed,
		 * unmap and free the clist chunks that were attached to
		 * this wrid.  NOTE(review): wd->c1/wd->c2 appear to be
		 * the write/reply chunk lists (lengths wd->wl/wd->rl,
		 * buffer sizes wd->l1/wd->l2) — confirm against the
		 * code that posts these sends.
		 */
		if(wd->c1){
		(void) clist_deregister1((CONN *)wd->c, (struct clist *)wd->c1, TRUE);
#ifdef	SERVER_REG_CACHE
		RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c, (rib_lrc_entry_t *)(((struct clist *)wd->c1)->long_reply_buf));
#else
		if(wd->c1 && wd->l1)
		kmem_free((void *) (wd->c1)->c_saddr, wd->l1);
#endif
		kmem_free((void *)(wd->c1), wd->wl * sizeof(struct clist));
		}
		if(wd->c2){
		(void) clist_deregister1((CONN *)wd->c, (struct clist *)wd->c2, TRUE);
#ifdef	SERVER_REG_CACHE
		RDMA_FREE_SERVER_CACHE_BUF((CONN *)wd->c, (rib_lrc_entry_t *)(((struct clist *)wd->c2)->long_reply_buf));
#else
		if(wd->l2)
		kmem_free((void *) (wd->c2)->c_saddr, wd->l2);
#endif
		kmem_free((void *)(wd->c2), wd->rl * sizeof(struct clist));
		}
#endif
		mutex_enter(&wd->sendwait_lock);
		if (wd->cv_sig == 1) {
			/*
			 * Update completion status and notify poster
			 */
			if (wc.wc_status == IBT_WC_SUCCESS)
				wd->status = RDMA_SUCCESS;
			else
				wd->status = RDMA_FAILED;
			cv_signal(&wd->wait_cv);
			mutex_exit(&wd->sendwait_lock);
		} else {
			/*
			 * Poster not waiting for notification.
			 * Free the send buffers and send_wid
			 */
			for (i = 0; i < wd->nsbufs; i++) {
				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
					(void *)(uintptr_t)wd->sbufaddr[i]);
			}
			mutex_exit(&wd->sendwait_lock);
			(void) rib_free_sendwait(wd);
		}
	    }
	}
}
1239 
1240 /*
1241  * RCQ handler
1242  */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct recv_wid	*rwid;
#if defined(CLNT_POLL_CQ)
	uint32_t	count = 0;	/* consecutive empty polls */
#endif

	/*
	 * Client-side receive completion queue handler: match each
	 * received RPC/RDMA reply to a waiting caller by xid.
	 *
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.  Under CLNT_POLL_CQ the
	 * handler instead spins on the CQ (up to max_poll_count empty
	 * polls) and only re-arms notification on the way out.
	 */
#if !defined(CLNT_POLL_CQ)
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
#endif

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
#if defined(CLNT_POLL_CQ)
		poll_cq_again:
#endif
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
#if defined(CLNT_POLL_CQ)
		 if (ibt_status == IBT_CQ_EMPTY){
				count ++;
		 if(count == max_poll_count){
				(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
				return;
		 }
		 goto poll_cq_again;
		 }
#endif
		/*
		 * NOTE(review): under CLNT_POLL_CQ this if-body opens a
		 * brace inside one #if block and closes it in another;
		 * fragile, but both build variants are well-formed.
		 */
		if (ibt_status != IBT_SUCCESS)
#if defined(CLNT_POLL_CQ)
		{
			(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
#endif
		    return;
#if defined(CLNT_POLL_CQ)
		}
		count = 0;
#endif
		/* wc_id carries the recv_wid we posted with the buffer. */
		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
		qp = rwid->qp;
		if (wc.wc_status == IBT_WC_SUCCESS) {
		    XDR			inxdrs, *xdrs;
		    uint_t		xid, vers, op, find_xid = 0;
		    struct reply	*r;
		    CONN *conn = qptoc(qp);
		    uint32_t rdma_credit = 0;

		    xdrs = &inxdrs;
		    xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
			wc.wc_bytes_xfer, XDR_DECODE);
		/*
		 * Treat xid as opaque (xid is the first entity
		 * in the rpc rdma message).
		 */
		    xid = *(uint32_t *)(uintptr_t)rwid->addr;
		/* Skip xid and set the xdr position accordingly. */
		    XDR_SETPOS(xdrs, sizeof (uint32_t));
		    (void) xdr_u_int(xdrs, &vers);
		    (void) xdr_u_int(xdrs, &rdma_credit);
		    (void) xdr_u_int(xdrs, &op);
		    XDR_DESTROY(xdrs);
		    if (vers != RPCRDMA_VERS) {
			/*
			 * Invalid RPC/RDMA version. Cannot interoperate.
			 * Set connection to ERROR state and bail out.
			 */
			mutex_enter(&conn->c_lock);
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR;
			mutex_exit(&conn->c_lock);
			rib_rbuf_free(conn, RECV_BUFFER,
				(void *)(uintptr_t)rwid->addr);
			rib_free_wid(rwid);
			continue;
		    }

		    /*
		     * Find the caller waiting on this xid and hand it the
		     * receive buffer; the waiter frees the buffer.
		     */
		    mutex_enter(&qp->replylist_lock);
		    for (r = qp->replylist; r != NULL; r = r->next) {
			if (r->xid == xid) {
			    find_xid = 1;
			    switch (op) {
			    case RDMA_MSG:
			    case RDMA_NOMSG:
			    case RDMA_MSGP:
				r->status = RDMA_SUCCESS;
				r->vaddr_cq = rwid->addr;
				r->bytes_xfer = wc.wc_bytes_xfer;
				cv_signal(&r->wait_cv);
				break;
			    default:
				/* Unknown op: drop the message. */
				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
						(void *)(uintptr_t)rwid->addr);
				break;
			    }
			    break;
			}
		    }
		    mutex_exit(&qp->replylist_lock);
		    if (find_xid == 0) {
			/* RPC caller not waiting for reply */
#ifdef DEBUG
			    if (rib_debug) {
			cmn_err(CE_NOTE, "rib_clnt_rcq_handler: "
			    "NO matching xid %u!\n", xid);
			    }
#endif
			rib_rbuf_free(qptoc(qp), RECV_BUFFER,
				(void *)(uintptr_t)rwid->addr);
		    }
		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
			CONN *conn = qptoc(qp);

			/*
			 * Connection being flushed. Just free
			 * the posted buffer
			 */
			rib_rbuf_free(conn, RECV_BUFFER,
				(void *)(uintptr_t)rwid->addr);
		} else {
			CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code                Local state     Remote State
 *  ====================                ===========     ============
 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR               None            None
 */
			/*
			 * Channel in error state. Set connection
			 * in ERROR state.
			 */
			mutex_enter(&conn->c_lock);
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR;
			mutex_exit(&conn->c_lock);
			rib_rbuf_free(conn, RECV_BUFFER,
				(void *)(uintptr_t)rwid->addr);
		}
		rib_free_wid(rwid);
	}
}
1396 
1397 /* Server side */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	struct recv_data *rd;
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct svc_recv	*s_recvp;
	CONN		*conn;
	mblk_t		*mp;

	/*
	 * Server-side receive completion queue handler: validate each
	 * incoming RPC/RDMA message and queue it to krpc (or handle
	 * RDMA_DONE acknowledgements inline).
	 *
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
		    return;

		/* wc_id carries the svc_recv posted with the buffer. */
		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);
		/*
		 * One fewer receive buffer outstanding; wake anyone
		 * waiting for the posted-buffer count to drain to zero.
		 */
		mutex_enter(&qp->posted_rbufs_lock);
		qp->n_posted_rbufs--;
#if defined(MEASURE_POOL_DEPTH)
	       rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
#endif
		if (qp->n_posted_rbufs == 0)
			cv_signal(&qp->posted_rbufs_cv);
		mutex_exit(&qp->posted_rbufs_lock);

		if (wc.wc_status == IBT_WC_SUCCESS) {
		    XDR		inxdrs, *xdrs;
		    uint_t	xid, vers, op;
		    uint32_t rdma_credit;

		    xdrs = &inxdrs;
		    /* s_recvp->vaddr stores data */
		    xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			wc.wc_bytes_xfer, XDR_DECODE);

		/*
		 * Treat xid as opaque (xid is the first entity
		 * in the rpc rdma message).
		 */
		    xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
		/* Skip xid and set the xdr position accordingly. */
		    XDR_SETPOS(xdrs, sizeof (uint32_t));
		    if (!xdr_u_int(xdrs, &vers) ||
			!xdr_u_int(xdrs, &rdma_credit) ||
			!xdr_u_int(xdrs, &op)) {
			/* Malformed header: drop the message. */
			rib_rbuf_free(conn, RECV_BUFFER,
				(void *)(uintptr_t)s_recvp->vaddr);
			XDR_DESTROY(xdrs);
#ifdef DEBUG
			cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
			    "xdr_u_int failed for qp %p, wc_id=%llx",
			    (void *)qp, (longlong_t)wc.wc_id);
#endif
			(void) rib_free_svc_recv(s_recvp);
			continue;
		    }
		    XDR_DESTROY(xdrs);

		    if (vers != RPCRDMA_VERS) {
			/*
			 * Invalid RPC/RDMA version. Drop rpc rdma message.
			 */
			rib_rbuf_free(conn, RECV_BUFFER,
				(void *)(uintptr_t)s_recvp->vaddr);
			(void) rib_free_svc_recv(s_recvp);
			continue;
		    }
			/*
			 * Is this for RDMA_DONE?
			 */
		    if (op == RDMA_DONE) {
			rib_rbuf_free(conn, RECV_BUFFER,
				(void *)(uintptr_t)s_recvp->vaddr);
			/*
			 * Wake up the thread waiting on
			 * a RDMA_DONE for xid
			 */
			mutex_enter(&qp->rdlist_lock);
			rdma_done_notify(qp, xid);
			mutex_exit(&qp->rdlist_lock);
			(void) rib_free_svc_recv(s_recvp);
			continue;
		    }

		    mutex_enter(&plugin_state_lock);
		    if (plugin_state == ACCEPT) {
			/* allocb can fail; block until a buffer frees up. */
			while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL)
			    (void) strwaitbuf(sizeof (*rd), BPRI_LO);
			/*
			 * Plugin is in accept state, hence the master
			 * transport queue for this is still accepting
			 * requests. Hence we can call svc_queuereq to
			 * queue this received msg.
			 */
			rd = (struct recv_data *)mp->b_rptr;
			rd->conn = conn;
			rd->rpcmsg.addr = (caddr_t)(uintptr_t)s_recvp->vaddr;
			rd->rpcmsg.type = RECV_BUFFER;
			rd->rpcmsg.len = wc.wc_bytes_xfer;
			rd->status = wc.wc_status;
			/* Hold the connection while krpc owns the msg. */
			mutex_enter(&conn->c_lock);
			conn->c_ref++;
			mutex_exit(&conn->c_lock);
			mp->b_wptr += sizeof (*rd);
			svc_queuereq((queue_t *)rib_stat->q, mp);
			mutex_exit(&plugin_state_lock);
		    } else {
			/*
			 * The master transport for this is going
			 * away and the queue is not accepting anymore
			 * requests for krpc, so don't do anything, just
			 * free the msg.
			 */
			mutex_exit(&plugin_state_lock);
			rib_rbuf_free(conn, RECV_BUFFER,
			(void *)(uintptr_t)s_recvp->vaddr);
		    }
		} else {
			/* Completion in error: reclaim the buffer. */
			rib_rbuf_free(conn, RECV_BUFFER,
				(void *)(uintptr_t)s_recvp->vaddr);
		}
		(void) rib_free_svc_recv(s_recvp);
	}
}
1534 
1535 /*
1536  * Handles DR event of IBT_HCA_DETACH_EVENT.
1537  */
1538 /* ARGSUSED */
1539 static void
1540 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1541         ibt_async_code_t code, ibt_async_event_t *event)
1542 {
1543 
1544         switch (code) {
1545         case IBT_HCA_ATTACH_EVENT:
1546                 /* ignore */
1547                 break;
1548         case IBT_HCA_DETACH_EVENT:
1549         {
1550                 ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1551                 rib_detach_hca(rib_stat->hca);
1552 #ifdef DEBUG
1553         cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1554 #endif
1555                 break;
1556         }
1557 #ifdef DEBUG
1558         case IBT_EVENT_PATH_MIGRATED:
1559         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n");
1560                 break;
1561         case IBT_EVENT_SQD:
1562         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1563                 break;
1564         case IBT_EVENT_COM_EST:
1565         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1566                 break;
1567         case IBT_ERROR_CATASTROPHIC_CHAN:
1568         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n");
1569                 break;
1570         case IBT_ERROR_INVALID_REQUEST_CHAN:
1571         cmn_err(CE_NOTE, "rib_async_handler(): "
1572                 "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1573                 break;
1574         case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1575         cmn_err(CE_NOTE, "rib_async_handler(): "
1576                 "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1577                 break;
1578         case IBT_ERROR_PATH_MIGRATE_REQ:
1579         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n");
1580                 break;
1581         case IBT_ERROR_CQ:
1582         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1583                 break;
1584         case IBT_ERROR_PORT_DOWN:
1585         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1586                 break;
1587         case IBT_EVENT_PORT_UP:
1588         cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1589                 break;
1590         case IBT_ASYNC_OPAQUE1:
1591         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1592                 break;
1593         case IBT_ASYNC_OPAQUE2:
1594         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1595                 break;
1596         case IBT_ASYNC_OPAQUE3:
1597         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1598                 break;
1599         case IBT_ASYNC_OPAQUE4:
1600         cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1601                 break;
1602 #endif
1603         default:
1604                 break;
1605         }
1606 }
1607 
1608 /*
1609  * Client's reachable function.
1610  */
1611 static rdma_stat
1612 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1613 {
1614         rib_hca_t       *hca;
1615         rdma_stat       status;
1616 
1617         /*
1618          * First check if a hca is still attached
1619          */
1620         *handle = NULL;
1621         rw_enter(&rib_stat->hca->state_lock, RW_READER);
1622         if (rib_stat->hca->state != HCA_INITED) {
1623                 rw_exit(&rib_stat->hca->state_lock);
1624                 return (RDMA_FAILED);
1625         }
1626         status = rib_ping_srv(addr_type, raddr, &hca);
1627         rw_exit(&rib_stat->hca->state_lock);
1628 
1629         if (status == RDMA_SUCCESS) {
1630                 *handle = (void *)hca;
1631                 /*
1632                  * Register the Address translation service
1633                  */
1634                 mutex_enter(&rib_stat->open_hca_lock);
1635                 if (ats_running == 0) {
1636                         if (rib_register_ats(rib_stat->hca)
1637                             == RDMA_SUCCESS) {
1638                                 ats_running = 1;
1639                                 mutex_exit(&rib_stat->open_hca_lock);
1640                                 return (RDMA_SUCCESS);
1641                         } else {
1642                                 mutex_exit(&rib_stat->open_hca_lock);
1643                                 return (RDMA_FAILED);
1644                         }
1645                 } else {
1646                         mutex_exit(&rib_stat->open_hca_lock);
1647                         return (RDMA_SUCCESS);
1648                 }
1649         } else {
1650                 *handle = NULL;
1651                 if (rib_debug > 2)
1652                     cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
1653                 return (RDMA_FAILED);
1654         }
1655 }
1656 
1657 /* Client side qp creation */
1658 static rdma_stat
1659 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1660 {
1661         rib_qp_t        *kqp = NULL;
1662         CONN            *conn;
1663         rdma_clnt_cred_ctrl_t *cc_info;
1664 
1665         ASSERT(qp != NULL);
1666         *qp = NULL;
1667 
1668         kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1669         conn = qptoc(kqp);
1670         kqp->hca = hca;
1671         kqp->rdmaconn.c_rdmamod = &rib_mod;
1672         kqp->rdmaconn.c_private = (caddr_t)kqp;
1673 
1674         kqp->mode = RIB_CLIENT;
1675         kqp->chan_flags = IBT_BLOCKING;
1676         conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1677         bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1678         conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1679 
1680         /*
1681          * Initialize
1682          */
1683         cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1684         cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1685         mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1686         mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1687         mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1688         mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1689         cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1690         mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1691 #if defined (CLNT_INTERRUPT_COAL)
1692         kqp->rdmaconn.c_count = 0;
1693         conn->c_count = 0;
1694         bzero(&kqp->wd, sizeof(struct send_wid));
1695         kqp->wd.forw = kqp->wd.back = &kqp->wd;
1696 #endif
1697         /*
1698          * Initialize the client credit control
1699          * portion of the rdmaconn struct.
1700          */
1701         kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1702         cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1703         cc_info->clnt_cc_granted_ops = 0;
1704         cc_info->clnt_cc_in_flight_ops = 0;
1705         cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1706 
1707         *qp = kqp;
1708         return (RDMA_SUCCESS);
1709 }
1710 
1711 /* Server side qp creation */
1712 static rdma_stat
1713 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1714 {
1715         rib_qp_t        *kqp = NULL;
1716         ibt_chan_sizes_t        chan_sizes;
1717         ibt_rc_chan_alloc_args_t        qp_attr;
1718         ibt_status_t            ibt_status;
1719         rdma_srv_cred_ctrl_t *cc_info;
1720 
1721         ASSERT(qp != NULL);
1722         *qp = NULL;
1723 
1724         kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1725         kqp->hca = hca;
1726         kqp->port_num = port;
1727         kqp->rdmaconn.c_rdmamod = &rib_mod;
1728         kqp->rdmaconn.c_private = (caddr_t)kqp;
1729 
1730         /*
1731          * Create the qp handle
1732          */
1733         bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1734         qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1735         qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1736         qp_attr.rc_pd = hca->pd_hdl;
1737         qp_attr.rc_hca_port_num = port;
1738         qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1739         qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1740         qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1741         qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1742         qp_attr.rc_clone_chan = NULL;
1743         qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1744         qp_attr.rc_flags = IBT_WR_SIGNALED;
1745 
1746         rw_enter(&hca->state_lock, RW_READER);
1747         if (hca->state != HCA_DETACHED) {
1748                 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1749                         IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1750                         &chan_sizes);
1751         } else {
1752                 rw_exit(&hca->state_lock);
1753                 goto fail;
1754         }
1755         rw_exit(&hca->state_lock);
1756 
1757         if (ibt_status != IBT_SUCCESS) {
1758                 cmn_err(CE_WARN, "rib_svc_create_chan: "
1759                         "ibt_alloc_rc_channel failed, ibt_status=%d.",
1760                         ibt_status);
1761                 goto fail;
1762         }
1763 
1764         kqp->mode = RIB_SERVER;
1765         kqp->chan_flags = IBT_BLOCKING;
1766         kqp->q = q;  /* server ONLY */
1767 
1768         cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1769         cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1770         mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1771         mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1772         mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1773         mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1774         cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1775         mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1776         /*
1777          * Set the private data area to qp to be used in callbacks
1778          */
1779         ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1780         kqp->rdmaconn.c_state = C_CONNECTED;
1781 
1782         /*
1783          * Initialize the server credit control
1784          * portion of the rdmaconn struct.
1785          */
1786         kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1787         cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1788         cc_info->srv_cc_buffers_granted = preposted_rbufs;
1789         cc_info->srv_cc_cur_buffers_used = 0;
1790         cc_info->srv_cc_posted = preposted_rbufs;
1791 
1792         *qp = kqp;
1793 
1794         num_clients++;
1795         return (RDMA_SUCCESS);
1796 fail:
1797         if (kqp)
1798                 kmem_free(kqp, sizeof (rib_qp_t));
1799 
1800         return (RDMA_FAILED);
1801 }
1802 
1803 void
1804 rib_dump_pathrec(ibt_path_info_t *path_rec)
1805 {
1806         ib_pkey_t       pkey;
1807 
1808         if (rib_debug > 1) {
1809             cmn_err(CE_NOTE, "Path Record:\n");
1810 
1811             cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
1812                 (longlong_t)path_rec->pi_hca_guid);
1813             cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
1814                 (longlong_t)path_rec->pi_sid);
1815             cmn_err(CE_NOTE, "Port Num        = %02d\n",
1816                 path_rec->pi_prim_cep_path.cep_hca_port_num);
1817             cmn_err(CE_NOTE, "P_Key Index     = %04d\n",
1818                 path_rec->pi_prim_cep_path.cep_pkey_ix);
1819 
1820             (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
1821                         path_rec->pi_prim_cep_path.cep_hca_port_num,
1822                         path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
1823             cmn_err(CE_NOTE, "P_Key             = 0x%x\n", pkey);
1824 
1825 
1826             cmn_err(CE_NOTE, "SGID:           = %llx:%llx\n",
1827                 (longlong_t)
1828                 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
1829                 (longlong_t)
1830                 path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);
1831 
1832             cmn_err(CE_NOTE, "DGID:           = %llx:%llx\n",
1833                 (longlong_t)
1834                 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
1835                 (longlong_t)
1836                 path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);
1837 
1838             cmn_err(CE_NOTE, "Path Rate       = %02x\n",
1839                 path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
1840             cmn_err(CE_NOTE, "SL              = %02x\n",
1841                 path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
1842             cmn_err(CE_NOTE, "Prim Packet LT  = %02x\n",
1843                 path_rec->pi_prim_pkt_lt);
1844             cmn_err(CE_NOTE, "Path MTU        = %02x\n",
1845                 path_rec->pi_path_mtu);
1846         }
1847 }
1848 
/*
 * Client-side IBT connection-manager callback, passed to
 * ibt_open_rc_channel() via chan_args.oc_cm_handler in
 * rib_conn_to_srv().
 *
 * Only IBT_CM_EVENT_CONN_CLOSED is acted on: if the remote end closed
 * the channel, the connection transitions to C_ERROR, the rc_channel
 * is freed, and — when no one else holds a reference — the CONN is
 * removed from the client connection list and freed.  All other CM
 * events are accepted unconditionally.
 *
 * Always returns IBT_CM_ACCEPT.
 */
/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
        rpcib_state_t   *ribstat;
        rib_hca_t       *hca;

        /* clnt_hdl is the rib_stat pointer registered at open time */
        ribstat = (rpcib_state_t *)clnt_hdl;
        hca = (rib_hca_t *)ribstat->hca;

        switch (event->cm_type) {

        /* got a connection close event */
        case IBT_CM_EVENT_CONN_CLOSED:
        {
                CONN    *conn;
                rib_qp_t *qp;

                /* check reason why connection was closed */
                switch (event->cm_event.closed) {
                case IBT_CM_CLOSED_DREP_RCVD:
                case IBT_CM_CLOSED_DREQ_TIMEOUT:
                case IBT_CM_CLOSED_DUP:
                case IBT_CM_CLOSED_ABORT:
                case IBT_CM_CLOSED_ALREADY:
                        /*
                         * These cases indicate the local end initiated
                         * the closing of the channel. Nothing to do here.
                         */
                        break;
                default:
                        /*
                         * Reason for CONN_CLOSED event must be one of
                         * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
                         * or IBT_CM_CLOSED_STALE. These indicate cases were
                         * the remote end is closing the channel. In these
                         * cases free the channel and transition to error
                         * state
                         */
                        qp = ibt_get_chan_private(event->cm_channel);
                        conn = qptoc(qp);
                        mutex_enter(&conn->c_lock);
                        /* teardown already in progress elsewhere; bail */
                        if (conn->c_state == C_DISCONN_PEND) {
                                mutex_exit(&conn->c_lock);
                                break;
                        }

                        conn->c_state = C_ERROR;

                        /*
                         * Free the rc_channel. Channel has already
                         * transitioned to ERROR state and WRs have been
                         * FLUSHED_ERR already.
                         */
                        (void) ibt_free_channel(qp->qp_hdl);
                        qp->qp_hdl = NULL;

                        /*
                         * Free the conn if c_ref is down to 0 already
                         */
                        if (conn->c_ref == 0) {
                                /*
                                 * Remove from list and free conn
                                 */
                                conn->c_state = C_DISCONN_PEND;
                                mutex_exit(&conn->c_lock);
                                (void) rib_disconnect_channel(conn,
                                        &hca->cl_conn_list);
                        } else {
                                /* holders will notice C_ERROR and clean up */
                                mutex_exit(&conn->c_lock);
                        }
#ifdef DEBUG
                        if (rib_debug)
                                cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
                                        "(CONN_CLOSED) channel disconnected");
#endif
                        break;
                }
                break;
        }
        default:
                break;
        }
        return (IBT_CM_ACCEPT);
}
1936 
1937 
/*
 * Check if server has done ATS (Address Translation Service)
 * registration.
 *
 * Builds the service name "<ip-address>::NFS" from raddr/addr_type,
 * then walks this HCA's ATS-registered ports (ats_list) asking IBTF
 * for paths to that service.  For each candidate path, ibt_query_ar()
 * verifies the server actually registered an address record.  The
 * first path that passes is copied into *path and RDMA_SUCCESS is
 * returned; the caller then uses it to connect.
 *
 * Returns RDMA_INVAL for an unknown address family, RDMA_FAILED if no
 * verified path is found (or the HCA is detached).
 *
 * Locking: hca->state_lock (reader) is held across the search;
 * hca->service_list_lock (reader) nests inside it and protects
 * ats_list.
 */
rdma_stat
rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
        int addr_type, ibt_path_info_t *path)
{
        struct sockaddr_in      *sin4;
        struct sockaddr_in6     *sin6;
        ibt_path_attr_t         path_attr;
        ibt_status_t            ibt_status;
        ib_pkey_t               pkey;
        ibt_ar_t                ar_query, ar_result;
        rib_service_t           *ats;
        ib_gid_t                sgid;
        ibt_path_info_t         paths[MAX_PORTS];
        uint8_t                 npaths, i;

        (void) bzero(&path_attr, sizeof (ibt_path_attr_t));
        (void) bzero(path, sizeof (ibt_path_info_t));

        /*
         * Construct svc name ("<ip>::NFS"); buffer freed on all exits.
         */
        path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
        switch (addr_type) {
        case AF_INET:
                sin4 = (struct sockaddr_in *)raddr->buf;
                (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
                    IB_SVC_NAME_LEN);
                break;

        case AF_INET6:
                sin6 = (struct sockaddr_in6 *)raddr->buf;
                (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
                    path_attr.pa_sname, IB_SVC_NAME_LEN);
                break;

        default:
                kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
                return (RDMA_INVAL);
        }
        (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);

        /*
         * Attempt a path to the server on an ATS-registered port.
         * Try all ATS-registered ports until one succeeds.
         * The first one that succeeds will be used to connect
         * to the server.  If none of them succeed, return RDMA_FAILED.
         */
        rw_enter(&hca->state_lock, RW_READER);
        if (hca->state != HCA_DETACHED) {
            rw_enter(&hca->service_list_lock, RW_READER);
            for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
                path_attr.pa_hca_guid = hca->hca_guid;
                path_attr.pa_hca_port_num = ats->srv_port;
                /* ask for up to 2 paths; IBT_INSUFF_DATA means fewer found */
                ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
                        IBT_PATH_MULTI_SVC_DEST, &path_attr, 2, paths, &npaths);
                if (ibt_status == IBT_SUCCESS ||
                        ibt_status == IBT_INSUFF_DATA) {
                    for (i = 0; i < npaths; i++) {
                        if (paths[i].pi_hca_guid) {
                        /*
                         * do ibt_query_ar() to confirm the server's
                         * ATS address record for this path
                         */
                            sgid =
                                paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid;

                            (void) ibt_index2pkey_byguid(paths[i].pi_hca_guid,
                                paths[i].pi_prim_cep_path.cep_hca_port_num,
                                paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey);

                            bzero(&ar_query, sizeof (ar_query));
                            bzero(&ar_result, sizeof (ar_result));
                            ar_query.ar_gid =
                                paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid;
                            ar_query.ar_pkey = pkey;
                            ibt_status = ibt_query_ar(&sgid, &ar_query,
                                        &ar_result);
                            if (ibt_status == IBT_SUCCESS) {
#ifdef DEBUG
                                if (rib_debug > 1)
                                    rib_dump_pathrec(&paths[i]);
#endif
                                /* verified path: hand it to the caller */
                                bcopy(&paths[i], path,
                                        sizeof (ibt_path_info_t));
                                rw_exit(&hca->service_list_lock);
                                kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
                                rw_exit(&hca->state_lock);
                                return (RDMA_SUCCESS);
                            }
#ifdef DEBUG
                            if (rib_debug) {
                                cmn_err(CE_NOTE, "rib_chk_srv_ats: "
                                    "ibt_query_ar FAILED, return\n");
                            }
#endif
                        }
                    }
                }
            }
            rw_exit(&hca->service_list_lock);
        }
        kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
        rw_exit(&hca->state_lock);
        return (RDMA_FAILED);
}
2043 
2044 
2045 /*
2046  * Connect to the server.
2047  */
2048 rdma_stat
2049 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
2050 {
2051         ibt_chan_open_args_t    chan_args;      /* channel args */
2052         ibt_chan_sizes_t        chan_sizes;
2053         ibt_rc_chan_alloc_args_t        qp_attr;
2054         ibt_status_t            ibt_status;
2055         ibt_rc_returns_t        ret_args;       /* conn reject info */
2056         int refresh = REFRESH_ATTEMPTS; /* refresh if IBT_CM_CONN_STALE */
2057 
2058         (void) bzero(&chan_args, sizeof (chan_args));
2059         (void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
2060 
2061         qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
2062         /* Alloc a RC channel */
2063         qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl; 
2064         qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
2065         qp_attr.rc_pd = hca->pd_hdl;
2066         qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
2067         qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
2068         qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
2069         qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
2070         qp_attr.rc_clone_chan = NULL;
2071         qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
2072         qp_attr.rc_flags = IBT_WR_SIGNALED;
2073 
2074         chan_args.oc_path = path;
2075         chan_args.oc_cm_handler = rib_clnt_cm_handler;
2076         chan_args.oc_cm_clnt_private = (void *)rib_stat;
2077         chan_args.oc_rdma_ra_out = 4; 
2078         chan_args.oc_rdma_ra_in = 4; 
2079         chan_args.oc_path_retry_cnt = 2;
2080         chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
2081 
2082 refresh:
2083         rw_enter(&hca->state_lock, RW_READER);
2084         if (hca->state != HCA_DETACHED) {
2085                 ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
2086                         IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
2087                         &chan_sizes);
2088         } else {
2089                 rw_exit(&hca->state_lock);
2090                 return (RDMA_FAILED);
2091         }
2092         rw_exit(&hca->state_lock);
2093 
2094         if (ibt_status != IBT_SUCCESS) {
2095 #ifdef DEBUG
2096                 cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
2097                 "failed, ibt_status=%d.", ibt_status);
2098 #endif
2099                 return (RDMA_FAILED);
2100         }
2101 
2102         /* Connect to the Server */
2103         (void) bzero(&ret_args, sizeof (ret_args));
2104         mutex_enter(&qp->cb_lock);
2105         ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
2106                         IBT_BLOCKING, &chan_args, &ret_args);
2107         if (ibt_status != IBT_SUCCESS) {
2108 #ifdef DEBUG
2109                 if (rib_debug)
2110                         cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
2111                                 " failed for qp %p, status=%d, "
2112                                 "ret_args.rc_status=%d\n",
2113                                 (void *)qp, ibt_status, ret_args.rc_status);
2114 #endif
2115                 (void) ibt_free_channel(qp->qp_hdl);
2116                 qp->qp_hdl = NULL;
2117                 mutex_exit(&qp->cb_lock);
2118                 if (refresh-- && ibt_status == IBT_CM_FAILURE &&
2119                         ret_args.rc_status == IBT_CM_CONN_STALE) {
2120                         /*
2121                          * Got IBT_CM_CONN_STALE probably because of stale
2122                          * data on the passive end of a channel that existed
2123                          * prior to reboot. Retry establishing a channel
2124                          * REFRESH_ATTEMPTS times, during which time the
2125                          * stale conditions on the server might clear up.
2126                          */
2127                         goto refresh;
2128                 }
2129                 return (RDMA_FAILED);
2130         }
2131         mutex_exit(&qp->cb_lock);
2132         /*
2133          * Set the private data area to qp to be used in callbacks
2134          */
2135         ibt_set_chan_private(qp->qp_hdl, (void *)qp);
2136         return (RDMA_SUCCESS);
2137 }
2138 
2139 rdma_stat
2140 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
2141 {
2142         struct sockaddr_in      *sin4;
2143         struct sockaddr_in6     *sin6;
2144         ibt_path_attr_t         path_attr;
2145         ibt_path_info_t         path;
2146         ibt_status_t            ibt_status;
2147 
2148         ASSERT(raddr->buf != NULL);
2149 
2150         bzero(&path_attr, sizeof (ibt_path_attr_t));
2151         bzero(&path, sizeof (ibt_path_info_t));
2152 
2153         /*
2154          * Conctruct svc name
2155          */
2156         path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
2157         switch (addr_type) {
2158         case AF_INET:
2159                 sin4 = (struct sockaddr_in *)raddr->buf;
2160                 (void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
2161                     IB_SVC_NAME_LEN);
2162                 break;
2163 
2164         case AF_INET6:
2165                 sin6 = (struct sockaddr_in6 *)raddr->buf;
2166                 (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
2167                     path_attr.pa_sname, IB_SVC_NAME_LEN);
2168                 break;
2169 
2170         default:
2171 #ifdef  DEBUG
2172             if (rib_debug) {
2173                 cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n");
2174             }
2175 #endif
2176                 kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
2177                 return (RDMA_INVAL);
2178         }
2179         (void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
2180 
2181         ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
2182                 IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
2183         kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
2184         if (ibt_status != IBT_SUCCESS) {
2185             if (rib_debug > 1) {
2186                 cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
2187                         " status=%d\n", ibt_status);
2188             }
2189         } else if (path.pi_hca_guid) {
2190                 ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
2191                 *hca = rib_stat->hca;
2192                 return (RDMA_SUCCESS);
2193         }
2194         return (RDMA_FAILED);
2195 }
2196 
2197 /*
2198  * Close channel, remove from connection list and
2199  * free up resources allocated for that channel.
2200  */
2201 rdma_stat
2202 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2203 {
2204         rib_qp_t        *qp = ctoqp(conn);
2205         rib_hca_t       *hca;
2206 
2207         /*
2208          * c_ref == 0 and connection is in C_DISCONN_PEND
2209          */
2210         hca = qp->hca;
2211         if (conn_list != NULL)
2212                 (void) rib_rm_conn(conn, conn_list);
2213         if (qp->qp_hdl != NULL) {
2214                 /*
2215                  * If the channel has not been establised,
2216                  * ibt_flush_channel is called to flush outstanding WRs
2217                  * on the Qs.  Otherwise, ibt_close_rc_channel() is
2218                  * called.  The channel is then freed.
2219                  */
2220                 if (conn_list != NULL)
2221                     (void) ibt_close_rc_channel(qp->qp_hdl,
2222                         IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
2223                 else
2224                     (void) ibt_flush_channel(qp->qp_hdl);
2225 
2226                 mutex_enter(&qp->posted_rbufs_lock);
2227                 while (qp->n_posted_rbufs)
2228                         cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2229                 mutex_exit(&qp->posted_rbufs_lock);
2230                 (void) ibt_free_channel(qp->qp_hdl);
2231                 qp->qp_hdl = NULL;
2232         }
2233         ASSERT(qp->rdlist == NULL);
2234         if (qp->replylist != NULL) {
2235                 (void) rib_rem_replylist(qp);
2236         }
2237 
2238         cv_destroy(&qp->cb_conn_cv);
2239         cv_destroy(&qp->posted_rbufs_cv);
2240         mutex_destroy(&qp->cb_lock);
2241 
2242         mutex_destroy(&qp->replylist_lock);
2243         mutex_destroy(&qp->posted_rbufs_lock);
2244         mutex_destroy(&qp->rdlist_lock);
2245 
2246         cv_destroy(&conn->c_cv);
2247         mutex_destroy(&conn->c_lock);
2248 
2249         if (conn->c_raddr.buf != NULL) {
2250                 kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2251         }
2252         if (conn->c_laddr.buf != NULL) {
2253                 kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2254         }
2255 
2256         /*
2257          * Credit control cleanup.
2258          */
2259         if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2260                 rdma_clnt_cred_ctrl_t *cc_info;
2261                 cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2262                 cv_destroy(&cc_info->clnt_cc_cv);
2263         }
2264 
2265         kmem_free(qp, sizeof (rib_qp_t));
2266 
2267         /*
2268          * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2269          * then the hca is no longer being used.
2270          */
2271         if (conn_list != NULL) {
2272                 rw_enter(&hca->state_lock, RW_READER);
2273                 if (hca->state == HCA_DETACHED) {
2274                         rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2275                         if (hca->srv_conn_list.conn_hd == NULL) {
2276                                 rw_enter(&hca->cl_conn_list.conn_lock,
2277                                         RW_READER);
2278                                 if (hca->cl_conn_list.conn_hd == NULL) {
2279                                         mutex_enter(&hca->inuse_lock);
2280                                         hca->inuse = FALSE;
2281                                         cv_signal(&hca->cb_cv);
2282                                         mutex_exit(&hca->inuse_lock);
2283                                 }
2284                                 rw_exit(&hca->cl_conn_list.conn_lock);
2285                         }
2286                         rw_exit(&hca->srv_conn_list.conn_lock);
2287                 }
2288                 rw_exit(&hca->state_lock);
2289         }
2290 
2291         num_clients--;
2292         return (RDMA_SUCCESS);
2293 }
2294 
#ifdef DYNAMIC_CREDIT_CONTROL
/*
 * Report server-side resource state for dynamic credit control:
 * the number of free buffers in the HCA's receive pool and the
 * current number of connected clients.
 */
void rib_get_resource_info(CONN *conn, int *current_clients, int *avail_bufs)
{
        rib_qp_t        *qp = ctoqp(conn);
        rib_hca_t       *hca = qp->hca;
        rib_bufpool_t   *rbp;
        bufpool_t       *pool;

        is_server  = 1;
        rbp = hca->recv_pool;

        if (rbp != NULL) {
                pool = rbp->bpool;
                *avail_bufs = pool->buffree;
        } else {
                /* no receive pool allocated yet: nothing available */
                *avail_bufs = 0;
        }

        *current_clients = num_clients;
}
#endif
2316 
2317 /*
2318  * Wait for send completion notification. Only on receiving a
2319  * notification be it a successful or error completion, free the
2320  * send_wid.
2321  */
2322 static rdma_stat
2323 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2324 {
2325         clock_t timout, cv_wait_ret;
2326         rdma_stat error = RDMA_SUCCESS;
2327         int     i;
2328 
2329         /*
2330          * Wait for send to complete
2331          */
2332         ASSERT(wd != NULL);
2333         mutex_enter(&wd->sendwait_lock);
2334         if (wd->status == (uint_t)SEND_WAIT) {
2335                 timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2336                     ddi_get_lbolt();
2337                 if (qp->mode == RIB_SERVER) {
2338                         while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2339                                     &wd->sendwait_lock, timout)) > 0 &&
2340                             wd->status == (uint_t)SEND_WAIT)
2341                                 ;
2342                         switch (cv_wait_ret) {
2343                         case -1:        /* timeout */
2344 #ifdef DEBUG
2345                                 if (rib_debug > 2)
2346                                         cmn_err(CE_WARN, "rib_sendwait: "
2347                                             "timed out qp %p\n", (void *)qp);
2348 #endif
2349                                 wd->cv_sig = 0;              /* no signal needed */
2350                                 error = RDMA_TIMEDOUT;
2351                                 break;
2352                         default:        /* got send completion */
2353                                 break;
2354                         }
2355                 } else {
2356                         while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2357                                     &wd->sendwait_lock, timout)) > 0 &&
2358                             wd->status == (uint_t)SEND_WAIT)
2359                                 ;
2360                         switch (cv_wait_ret) {
2361                         case -1:        /* timeout */
2362 #ifdef DEBUG
2363                                 if (rib_debug > 2)
2364                                         cmn_err(CE_WARN, "rib_sendwait: "
2365                                             "timed out qp %p\n", (void *)qp);
2366 #endif
2367                                 wd->cv_sig = 0;              /* no signal needed */
2368                                 error = RDMA_TIMEDOUT;
2369                                 break;
2370                         case 0:         /* interrupted */
2371 #ifdef DEBUG
2372                                 if (rib_debug > 2)
2373                                         cmn_err(CE_NOTE, "rib_sendwait:"
2374                                             " interrupted on qp %p\n",
2375                                             (void *)qp);
2376 #endif
2377                                 wd->cv_sig = 0;              /* no signal needed */
2378                                 error = RDMA_INTR;
2379                                 break;
2380                         default:        /* got send completion */
2381                                 break;
2382                         }
2383                 }
2384         }
2385 
2386         if (wd->status != (uint_t)SEND_WAIT) {
2387                 /* got send completion */
2388                 if (wd->status != RDMA_SUCCESS) {
2389                     error = wd->status;
2390                     if (wd->status != RDMA_CONNLOST)
2391                         error = RDMA_FAILED;
2392                 }
2393                 for (i = 0; i < wd->nsbufs; i++) {
2394                         rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2395                                 (void *)(uintptr_t)wd->sbufaddr[i]);
2396                 }
2397                 mutex_exit(&wd->sendwait_lock);
2398                 (void) rib_free_sendwait(wd);
2399         } else {
2400                 mutex_exit(&wd->sendwait_lock);
2401         }
2402 
2403         return (error);
2404 }
2405 
2406 static struct send_wid *
2407 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2408 {
2409         struct send_wid *wd;
2410 
2411         wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2412         wd->xid = xid;
2413         wd->cv_sig = cv_sig;
2414         wd->qp = qp;
2415         cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2416         mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2417         wd->status = (uint_t)SEND_WAIT;
2418 
2419         return (wd);
2420 }
2421 
2422 static int
2423 rib_free_sendwait(struct send_wid *wdesc)
2424 {
2425         cv_destroy(&wdesc->wait_cv);
2426         mutex_destroy(&wdesc->sendwait_lock);
2427         kmem_free(wdesc, sizeof (*wdesc));
2428 
2429         return (0);
2430 }
2431 
2432 static rdma_stat
2433 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2434 {
2435         mutex_enter(&qp->replylist_lock);
2436         if (rep != NULL) {
2437             (void) rib_remreply(qp, rep);
2438             mutex_exit(&qp->replylist_lock);
2439             return (RDMA_SUCCESS);
2440         }
2441         mutex_exit(&qp->replylist_lock);
2442         return (RDMA_FAILED);
2443 }
2444 
2445 /*
2446  * Send buffers are freed here only in case of error in posting
2447  * on QP. If the post succeeded, the send buffers are freed upon
2448  * send completion in rib_sendwait() or in the scq_handler.
2449  */
2450 rdma_stat
2451 #if defined(ASYNC_SERVER_DEREG)
2452 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2453         int send_sig, int cv_sig, caddr_t c, caddr_t c1, int l1, caddr_t c2, int l2, int l3, int l4) 
2454 #else 
2455 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid, 
2456         int send_sig, int cv_sig, caddr_t *swid) 
2457 #endif 
2458 {
2459         struct send_wid *wdesc;
2460         struct clist    *clp;
2461         ibt_status_t    ibt_status = IBT_SUCCESS;
2462         rdma_stat       ret = RDMA_SUCCESS;
2463         ibt_send_wr_t   tx_wr;
2464         int             i, nds;
2465         ibt_wr_ds_t     sgl[DSEG_MAX];
2466         uint_t          total_msg_size;
2467         rib_qp_t        *qp = ctoqp(conn);
2468 
2469         ASSERT(cl != NULL);
2470 
2471         bzero(&tx_wr, sizeof (ibt_send_wr_t));
2472 
2473         nds = 0;
2474         total_msg_size = 0;
2475         clp = cl;
2476         while (clp != NULL) {
2477                 if (nds >= DSEG_MAX) {
2478                         cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2479                             " too small!");
2480                         return (RDMA_FAILED);
2481                 }
2482                 sgl[nds].ds_va = clp->c_saddr;
2483                 sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2484                 sgl[nds].ds_len = clp->c_len;
2485                 total_msg_size += clp->c_len;
2486                 clp = clp->c_next;
2487                 nds++;
2488         }
2489 
2490         if (send_sig) {
2491                 /* Set SEND_SIGNAL flag. */
2492                 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2493                 wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2494                 *swid = (caddr_t)wdesc;
2495         } else {
2496                 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2497                 wdesc = rib_init_sendwait(msgid, 0, qp);
2498                 *swid = (caddr_t)wdesc;
2499         }
2500         wdesc->nsbufs = nds;
2501 #if defined(ASYNC_SERVER_DEREG)
2502         wdesc->c      = c;
2503         wdesc->c1     = c1;
2504         wdesc->c2     = c2;
2505         wdesc->l1     = l1;
2506         wdesc->l2     = l2;
2507         wdesc->wl     = l3;
2508         wdesc->rl     = l4;
2509 #endif
2510         for (i = 0; i < nds; i++) {
2511                 wdesc->sbufaddr[i] = sgl[i].ds_va;
2512         }
2513 
2514         tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2515         tx_wr.wr_opcode = IBT_WRC_SEND;
2516         tx_wr.wr_trans = IBT_RC_SRV;
2517         tx_wr.wr_nds = nds;
2518         tx_wr.wr_sgl = sgl;
2519 
2520         mutex_enter(&conn->c_lock);
2521         if (conn->c_state & C_CONNECTED) {
2522                 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2523         }
2524         if (((conn->c_state & C_CONNECTED) == 0) ||
2525                 ibt_status != IBT_SUCCESS) {
2526                 mutex_exit(&conn->c_lock);
2527                 for (i = 0; i < nds; i++) {
2528                         rib_rbuf_free(conn, SEND_BUFFER,
2529                                 (void *)(uintptr_t)wdesc->sbufaddr[i]);
2530                 }
2531                 (void) rib_free_sendwait(wdesc);
2532 #ifdef DEBUG
2533                 if (rib_debug && ibt_status != IBT_SUCCESS)
2534                         cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send "
2535                                 "failed! wr_id %llx on qpn %p, status=%d!",
2536                                 (longlong_t)tx_wr.wr_id, (void *)qp,
2537                                 ibt_status);
2538 #endif
2539                 return (RDMA_FAILED);
2540         }
2541         mutex_exit(&conn->c_lock);
2542 
2543         if (send_sig) {
2544             if (cv_sig) {
2545                 /*
2546                  * cv_wait for send to complete.
2547                  * We can fail due to a timeout or signal or
2548                  * unsuccessful send.
2549                  */
2550                 ret = rib_sendwait(qp, wdesc);
2551 #ifdef DEBUG
2552             if (rib_debug > 2)
2553                 if (ret != 0) {
2554                     cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
2555                         "FAILED, rdma stat=%d, wr_id %llx, qp %p!",
2556                         ret, (longlong_t)tx_wr.wr_id, (void *)qp);
2557                 }
2558 #endif
2559                 return (ret);
2560             }
2561         }
2562 
2563         return (RDMA_SUCCESS);
2564 }
2565 
#if defined (CLNT_INTERRUPT_COAL)
/*
 * Client send with interrupt coalescing.  Most sends are posted
 * unsignaled and their send_wids queued on qp->wd; once roughly
 * half the preposted receive buffers' worth of sends are pending,
 * one signaled send-and-wait is issued and all queued wids are
 * reaped via rib_scq_free().
 */
rdma_stat
rib_send_bl(CONN *conn, struct clist *cl, uint32_t msgid)
{
	rdma_stat	ret;
	struct send_wid	*sd, dlist;
	rib_qp_t	*qp = ctoqp(conn);
	caddr_t		wd;

	mutex_enter(&conn->c_lock);
	if ((conn->c_count + 1) >= (preposted_rbufs / 2)) {
		/*
		 * Coalescing threshold reached: move all pending wids to
		 * a private list, do a signaled send, then reap them.
		 */
		conn->c_count = 0;
		dlist.forw = dlist.back = &dlist;
		while (qp->wd.forw != &qp->wd) {
			sd = qp->wd.forw;
			remque(sd);
			insque(sd, &dlist);
		}
		mutex_exit(&conn->c_lock);
		ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
		while (dlist.forw != &dlist) {
			sd = dlist.forw;
			remque(dlist.forw);
			rib_scq_free((caddr_t)sd);
		}
	} else {
		mutex_exit(&conn->c_lock);
		wd = 0;
		ret = rib_send_and_wait(conn, cl, msgid, 0, 0, &wd);
		/*
		 * FIX: only queue the wid when the post succeeded.  On
		 * failure rib_send_and_wait() has already freed the
		 * send_wid, so queueing it was a use-after-free.
		 */
		if (ret == RDMA_SUCCESS) {
			mutex_enter(&conn->c_lock);
			conn->c_count++;
			insque(wd, &qp->wd);
			mutex_exit(&conn->c_lock);
		}
	}
	return (ret);
}
#endif
2602 
2603 rdma_stat
2604 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2605 {
2606         rdma_stat       ret;
2607         /* send-wait & cv_signal */
2608 #if defined(ASYNC_SERVER_DEREG)
2609         ret = rib_send_and_wait(conn, cl, msgid,1,1,0,0,0,0,0,0,0, &wd);
2610 #else
2611         ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd); 
2612 #endif
2613         return (ret);
2614 }
2615 
#if defined(ASYNC_SERVER_DEREG)
/*
 * Post a send without waiting on its completion (server-side async
 * deregistration path); completion handling happens in the scq
 * handler using the context passed through to rib_send_and_wait().
 */
rdma_stat
rib_send_nw(CONN *conn, struct clist *cl, uint32_t msgid, caddr_t c,
	caddr_t c1, int c2, caddr_t c3, int c4, int c5, int c6)
{
	rdma_stat	ret;
	/*
	 * FIX: "caddr_t *wid" was an uninitialized pointer passed to
	 * rib_send_and_wait(), which stores through its swid argument
	 * (wild-pointer write).  Pass the address of a local instead.
	 */
	caddr_t		wid;

	/* send-wait, no cv_signal */
	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, c, c1, c2, c3,
	    c4, c5, c6, &wid);

	return (ret);
}
#endif
2628 /*
2629  * Server interface (svc_rdma_ksend).
2630  * Send RPC reply and wait for RDMA_DONE.
2631  */
2632 rdma_stat
2633 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2634 {
2635         rdma_stat ret = RDMA_SUCCESS;
2636         struct rdma_done_list *rd;
2637         clock_t timout, cv_wait_ret;
2638         caddr_t *wid;
2639         rib_qp_t *qp = ctoqp(conn);
2640 
2641         mutex_enter(&qp->rdlist_lock);
2642         rd = rdma_done_add(qp, msgid);
2643 
2644         /* No cv_signal (whether send-wait or no-send-wait) */
2645 #if defined(ASYNC_SERVER_DEREG) 
2646         ret = rib_send_and_wait(conn, cl, msgid, 1, 0, 0, 0, 0, 0, 0, 0, 0, wid); 
2647 #else 
2648         ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid); 
2649 #endif 
2650         if (ret != RDMA_SUCCESS) {
2651 #ifdef DEBUG
2652             cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2653                 "failed, msgid %u, qp %p", msgid, (void *)qp);
2654 #endif
2655             rdma_done_rm(qp, rd);
2656             goto done;
2657         }
2658 
2659         /*
2660          * Wait for RDMA_DONE from remote end
2661          */
2662         timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2663         cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2664             timout);
2665         rdma_done_rm(qp, rd);
2666         if (cv_wait_ret < 0) {
2667 #ifdef DEBUG
2668                 if (rib_debug > 1) {
2669                         cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2670                             " recv'd for qp %p, xid:%u\n",
2671                             (void *)qp, msgid);
2672                 }
2673 #endif
2674                 ret = RDMA_TIMEDOUT;
2675                 goto done;
2676         }
2677 
2678 done:
2679         mutex_exit(&qp->rdlist_lock);
2680         return (ret);
2681 }
2682 
2683 static struct recv_wid *
2684 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2685 {
2686         struct recv_wid *rwid;
2687 
2688         rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2689         rwid->xid = msgid;
2690         rwid->addr = sgl->ds_va;
2691         rwid->qp = qp;
2692 
2693         return (rwid);
2694 }
2695 
2696 static void
2697 rib_free_wid(struct recv_wid *rwid)
2698 {
2699         kmem_free(rwid, sizeof (struct recv_wid));
2700 }
2701 
2702 rdma_stat
2703 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2704 {
2705         rib_qp_t        *qp = ctoqp(conn);
2706         struct clist    *clp = cl;
2707         struct reply    *rep;
2708         struct recv_wid *rwid;
2709         int             nds;
2710         ibt_wr_ds_t     sgl[DSEG_MAX];
2711         ibt_recv_wr_t   recv_wr;
2712         rdma_stat       ret;
2713         ibt_status_t    ibt_status;
2714 
2715         /*
2716          * rdma_clnt_postrecv uses RECV_BUFFER.
2717          */
2718 
2719         nds = 0;
2720         while (cl != NULL) {
2721                 if (nds >= DSEG_MAX) {
2722                     cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!");
2723                     ret = RDMA_FAILED;
2724                     goto done;
2725                 }
2726                 sgl[nds].ds_va = cl->c_saddr;
2727                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2728                 sgl[nds].ds_len = cl->c_len;
2729                 cl = cl->c_next;
2730                 nds++;
2731         }
2732 
2733         if (nds != 1) {
2734             cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n");
2735             ret = RDMA_FAILED;
2736             goto done;
2737         }
2738         bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2739         recv_wr.wr_nds = nds;
2740         recv_wr.wr_sgl = sgl;
2741 
2742         rwid = rib_create_wid(qp, &sgl[0], msgid);
2743         if (rwid) {
2744             recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2745         } else {
2746                 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2747                 ret = RDMA_NORESOURCE;
2748                 goto done;
2749         }
2750         rep = rib_addreplylist(qp, msgid);
2751         if (!rep) {
2752                 cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2753                 rib_free_wid(rwid);
2754                 ret = RDMA_NORESOURCE;
2755                 goto done;
2756         }
2757 
2758         mutex_enter(&conn->c_lock);
2759         if (conn->c_state & C_CONNECTED) {
2760                 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2761         }
2762         if (((conn->c_state & C_CONNECTED) == 0) ||
2763                 ibt_status != IBT_SUCCESS) {
2764                 mutex_exit(&conn->c_lock);
2765 #ifdef DEBUG
2766                 cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in "
2767                     "ibt_post_recv(), msgid=%d, status=%d",
2768                     (void *)qp,  msgid, ibt_status);
2769 #endif
2770                 rib_free_wid(rwid);
2771                 (void) rib_rem_rep(qp, rep);
2772                 ret = RDMA_FAILED;
2773                 goto done;
2774         }
2775         mutex_exit(&conn->c_lock);
2776         return (RDMA_SUCCESS);
2777 
2778 done:
2779         while (clp != NULL) {
2780             rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->c_saddr);
2781             clp = clp->c_next;
2782         }
2783         return (ret);
2784 }
2785 
2786 rdma_stat
2787 rib_svc_post(CONN* conn, struct clist *cl)
2788 {
2789         rib_qp_t        *qp = ctoqp(conn);
2790         struct svc_recv *s_recvp;
2791         int             nds;
2792         ibt_wr_ds_t     sgl[DSEG_MAX];
2793         ibt_recv_wr_t   recv_wr;
2794         ibt_status_t    ibt_status;
2795 
2796         nds = 0;
2797         while (cl != NULL) {
2798                 if (nds >= DSEG_MAX) {
2799                     cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!");
2800                     return (RDMA_FAILED);
2801                 }
2802                 sgl[nds].ds_va = cl->c_saddr;
2803                 sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2804                 sgl[nds].ds_len = cl->c_len;
2805                 cl = cl->c_next;
2806                 nds++;
2807         }
2808 
2809         if (nds != 1) {
2810             cmn_err(CE_WARN, "rib_svc_post: nds!=1\n");
2811             rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va);
2812             return (RDMA_FAILED);
2813         }
2814         bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2815         recv_wr.wr_nds = nds;
2816         recv_wr.wr_sgl = sgl;
2817 
2818         s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2819         /* Use s_recvp's addr as wr id */
2820         recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2821         mutex_enter(&conn->c_lock);
2822         if (conn->c_state & C_CONNECTED) {
2823                 ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2824         }
2825         if (((conn->c_state & C_CONNECTED) == 0) ||
2826                 ibt_status != IBT_SUCCESS) {
2827                 mutex_exit(&conn->c_lock);
2828 #ifdef DEBUG
2829                 cmn_err(CE_WARN, "rib_svc_post: QP %p failed in "
2830                     "ibt_post_recv(), status=%d",
2831                     (void *)qp, ibt_status);
2832 #endif
2833                 rib_rbuf_free(conn, RECV_BUFFER,
2834                         (caddr_t)(uintptr_t)sgl[0].ds_va);
2835                 (void) rib_free_svc_recv(s_recvp);
2836                 return (RDMA_FAILED);
2837         }
2838         mutex_exit(&conn->c_lock);
2839 
2840         return (RDMA_SUCCESS);
2841 }
2842 
2843 /* Client */
2844 rdma_stat
2845 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2846 {
2847 
2848         return (rib_clnt_post(conn, cl, msgid));
2849 }
2850 
2851 /* Server */
2852 rdma_stat
2853 rib_post_recv(CONN *conn, struct clist *cl)
2854 {
2855         rib_qp_t        *qp = ctoqp(conn);
2856 
2857         if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2858                 mutex_enter(&qp->posted_rbufs_lock);
2859                 qp->n_posted_rbufs++;
2860                 mutex_exit(&qp->posted_rbufs_lock);
2861                 return (RDMA_SUCCESS);
2862         }
2863         return (RDMA_FAILED);
2864 }
2865 
2866 /*
2867  * Client side only interface to "recv" the rpc reply buf
2868  * posted earlier by rib_post_resp(conn, cl, msgid).
2869  */
2870 rdma_stat
2871 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2872 {
2873         struct reply *rep = NULL;
2874         clock_t timout, cv_wait_ret;
2875         rdma_stat ret = RDMA_SUCCESS;
2876         rib_qp_t *qp = ctoqp(conn);
2877 
2878         /*
2879          * Find the reply structure for this msgid
2880          */
2881         mutex_enter(&qp->replylist_lock);
2882 
2883         for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2884             if (rep->xid == msgid)
2885                 break;
2886         }
2887         if (rep != NULL) {
2888                 /*
2889                  * If message not yet received, wait.
2890                  */
2891                 if (rep->status == (uint_t)REPLY_WAIT) {
2892                         timout = ddi_get_lbolt() +
2893                             drv_usectohz(REPLY_WAIT_TIME * 1000000);
2894                         while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2895                                     &qp->replylist_lock, timout)) > 0 &&
2896                             rep->status == (uint_t)REPLY_WAIT);
2897 
2898                         switch (cv_wait_ret) {
2899                         case -1:        /* timeout */
2900                                 ret = RDMA_TIMEDOUT;
2901                                 break;
2902                         case 0:
2903                                 ret = RDMA_INTR;
2904                                 break;
2905                         default:
2906                                 break;
2907                         }
2908                 }
2909 
2910                 if (rep->status == RDMA_SUCCESS) {
2911                         struct clist *cl = NULL;
2912 
2913                         /*
2914                          * Got message successfully
2915                          */
2916                         clist_add(&cl, 0, rep->bytes_xfer, NULL,
2917                             (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2918                         *clp = cl;
2919                 } else {
2920                         if (rep->status != (uint_t)REPLY_WAIT) {
2921                                 /*
2922                                  * Got error in reply message. Free
2923                                  * recv buffer here.
2924                                  */
2925                                 ret = rep->status;
2926                                 rib_rbuf_free(conn, RECV_BUFFER,
2927                                         (caddr_t)(uintptr_t)rep->vaddr_cq);
2928                         }
2929                 }
2930                 (void) rib_remreply(qp, rep);
2931         } else {
2932                 /*
2933                  * No matching reply structure found for given msgid on the
2934                  * reply wait list.
2935                  */
2936                 ret = RDMA_INVAL;
2937 #ifdef DEBUG
2938                 cmn_err(CE_WARN, "rib_recv: no matching reply for "
2939                     "xid %u, qp %p\n", msgid, (void *)qp);
2940 #endif
2941         }
2942 
2943         /*
2944          * Done.
2945          */
2946         mutex_exit(&qp->replylist_lock);
2947         return (ret);
2948 }
2949 
2950 /*
2951  * RDMA write a buffer to the remote address.
2952  */
2953 rdma_stat
2954 rib_write(CONN *conn, struct clist *cl, int wait)
2955 {
2956         ibt_send_wr_t   tx_wr;

2957         int             cv_sig;
2958         ibt_wr_ds_t     sgl[DSEG_MAX];
2959         struct send_wid *wdesc;
2960         ibt_status_t    ibt_status;
2961         rdma_stat       ret = RDMA_SUCCESS;
2962         rib_qp_t        *qp = ctoqp(conn);
2963 
2964         if (cl == NULL) {
2965                 cmn_err(CE_WARN, "rib_write: NULL clist\n");
2966                 return (RDMA_FAILED);
2967         }
2968 
2969 
2970         while ((cl != NULL)) {
2971                 if(cl->c_len > 0){
2972                 bzero(&tx_wr, sizeof (ibt_send_wr_t));



2973                 tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2974                 tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
2975                 sgl[0].ds_va = cl->c_saddr;
2976                 sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2977                 sgl[0].ds_len = cl->c_len;
2978 













2979         if (wait) {
2980                 tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2981                 cv_sig = 1;
2982         } else {
2983                 tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2984                 cv_sig = 0;
2985         }
2986 
2987         wdesc = rib_init_sendwait(0, cv_sig, qp);
2988         tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2989         tx_wr.wr_opcode = IBT_WRC_RDMAW;
2990         tx_wr.wr_trans = IBT_RC_SRV;
2991         tx_wr.wr_nds = 1; 
2992         tx_wr.wr_sgl = sgl;
2993 
2994         mutex_enter(&conn->c_lock);
2995         if (conn->c_state & C_CONNECTED) {
2996                 ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2997         }
2998         if (((conn->c_state & C_CONNECTED) == 0) ||
2999                 ibt_status != IBT_SUCCESS) {
3000                 mutex_exit(&conn->c_lock);
3001                 (void) rib_free_sendwait(wdesc);
3002                 return (RDMA_FAILED);
3003         }
3004         mutex_exit(&conn->c_lock);
3005 
3006         /*
3007          * Wait for send to complete
3008          */
3009         if (wait) {
3010                 ret = rib_sendwait(qp, wdesc);
3011                 if (ret != 0) {
3012                         return (ret);
3013                 }
3014         }
3015         }
3016                 cl = cl->c_next;
3017         }
3018         return (RDMA_SUCCESS);
3019 }
3020 
3021 /*
3022  * RDMA Read a buffer from the remote address.
3023  */
3024 rdma_stat
3025 rib_read(CONN *conn, struct clist *cl, int wait)
3026 {
3027         ibt_send_wr_t   rx_wr;
3028         int             nds;
3029         int             cv_sig;
3030         ibt_wr_ds_t     sgl[DSEG_MAX];  /* is 2 sufficient? */
3031         struct send_wid *wdesc;
3032         ibt_status_t    ibt_status = IBT_SUCCESS;
3033         rdma_stat       ret = RDMA_SUCCESS;
3034         rib_qp_t        *qp = ctoqp(conn);
3035 
3036         if (cl == NULL) {
3037                 cmn_err(CE_WARN, "rib_read: NULL clist\n");
3038                 return (RDMA_FAILED);
3039         }
3040 
3041         bzero(&rx_wr, sizeof (ibt_send_wr_t));
3042         /*
3043          * Remote address is at the head chunk item in list.
3044          */
3045         rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr;
3046         rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */
3047 
3048         nds = 0;
3049         while (cl != NULL) {
3050                 if (nds >= DSEG_MAX) {
3051                         cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!");
3052                         return (RDMA_FAILED);
3053                 }
3054                 sgl[nds].ds_va = cl->c_daddr;
3055                 sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
3056                 sgl[nds].ds_len = cl->c_len;
3057                 cl = cl->c_next;
3058                 nds++;
3059         }
3060 
3061         if (wait) {
3062                 rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
3063                 cv_sig = 1;
3064         } else {
3065                 rx_wr.wr_flags = IBT_WR_NO_FLAGS;
3066                 cv_sig = 0;
3067         }
3068 
3069         wdesc = rib_init_sendwait(0, cv_sig, qp);
3070         rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
3071         rx_wr.wr_opcode = IBT_WRC_RDMAR;
3072         rx_wr.wr_trans = IBT_RC_SRV;
3073         rx_wr.wr_nds = nds;
3074         rx_wr.wr_sgl = sgl;
3075 
3076         mutex_enter(&conn->c_lock);
3077         if (conn->c_state & C_CONNECTED) {
3078                 ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
3079         }
3080         if (((conn->c_state & C_CONNECTED) == 0) ||
3081                 ibt_status != IBT_SUCCESS) {
3082                 mutex_exit(&conn->c_lock);
3083 #ifdef DEBUG
3084                 if (rib_debug && ibt_status != IBT_SUCCESS)
3085                         cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR"
3086                                 " wr_id %llx on qp %p, status=%d",
3087                                 (longlong_t)rx_wr.wr_id, (void *)qp,
3088                                 ibt_status);
3089 #endif
3090                 (void) rib_free_sendwait(wdesc);
3091                 return (RDMA_FAILED);
3092         }
3093         mutex_exit(&conn->c_lock);
3094 
3095         /*
3096          * Wait for send to complete
3097          */
3098         if (wait) {
3099                 ret = rib_sendwait(qp, wdesc);
3100                 if (ret != 0) {
3101                         return (ret);
3102                 }
3103         }
3104 
3105         return (RDMA_SUCCESS);
3106 }
3107 
3108 int
3109 is_for_ipv4(ibt_ar_t *result)
3110 {
3111         int     i, size = sizeof (struct in_addr);
3112         uint8_t zero = 0;
3113 
3114         for (i = 0; i < (ATS_AR_DATA_LEN - size); i++)
3115                 zero |= result->ar_data[i];
3116         return (zero == 0);
3117 }
3118 
3119 /*
3120  * rib_srv_cm_handler()
3121  *    Connection Manager callback to handle RC connection requests.
3122  */
3123 /* ARGSUSED */
3124 static ibt_cm_status_t
3125 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
3126         ibt_cm_return_args_t *ret_args, void *priv_data,
3127         ibt_priv_data_len_t len)
3128 {
3129         queue_t         *q;
3130         rib_qp_t        *qp;
3131         rpcib_state_t   *ribstat;
3132         rib_hca_t       *hca;
3133         rdma_stat       status = RDMA_SUCCESS;
3134         int             i;
3135         struct clist    cl;
3136         rdma_buf_t      rdbuf = {0}; 
3137         void            *buf = NULL;
3138         ibt_cm_req_rcv_t        cm_req_rcv;
3139         CONN            *conn;
3140         ibt_status_t ibt_status;
3141         ibt_ar_t        ar_query, ar_result;
3142         ib_gid_t        sgid;
3143 
3144 
3145         ASSERT(any != NULL);
3146         ASSERT(event != NULL);
3147 
3148         ribstat = (rpcib_state_t *)any;
3149         hca = (rib_hca_t *)ribstat->hca;
3150         ASSERT(hca != NULL);
3151 
3152         /* got a connection request */
3153         switch (event->cm_type) {
3154         case IBT_CM_EVENT_REQ_RCV:
3155                 /*
3156                  * If the plugin is in the NO_ACCEPT state, bail out.
3157                  */
3158                 mutex_enter(&plugin_state_lock);
3159                 if (plugin_state == NO_ACCEPT) {
3160                         mutex_exit(&plugin_state_lock);
3161                         return (IBT_CM_REJECT);
3162                 }
3163                 mutex_exit(&plugin_state_lock);
3164 
3165                 /*
3166                  * Need to send a MRA MAD to CM so that it does not
3167                  * timeout on us.
3168                  */
3169                 (void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
3170                             event->cm_event.req.req_timeout * 8, NULL, 0);
3171 
3172                 mutex_enter(&rib_stat->open_hca_lock);
3173                 q = rib_stat->q;
3174                 mutex_exit(&rib_stat->open_hca_lock);
3175                 status = rib_svc_create_chan(hca, (caddr_t)q,
3176                         event->cm_event.req.req_prim_hca_port, &qp);
3177                 if (status) {
3178 #ifdef DEBUG
3179                         cmn_err(CE_WARN, "rib_srv_cm_handler: "
3180                             "create_channel failed %d", status);
3181 #endif
3182                         return (IBT_CM_REJECT);
3183                 }
3184                 cm_req_rcv = event->cm_event.req;
3185 
3186 #ifdef DEBUG
3187                 if (rib_debug > 2) {
3188                     cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3189                         "server recv'ed IBT_CM_EVENT_REQ_RCV\n");
3190                     cmn_err(CE_NOTE, "\t\t SID:%llx\n",
3191                                 (longlong_t)cm_req_rcv.req_service_id);
3192                     cmn_err(CE_NOTE, "\t\t Local Port:%d\n",
3193                                 cm_req_rcv.req_prim_hca_port);
3194                     cmn_err(CE_NOTE,
3195                         "\t\t Remote GID:(prefix:%llx,guid:%llx)\n",
3196                         (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix,
3197                         (longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid);
3198                     cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n",
3199                         (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix,
3200                         (longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid);
3201                     cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
3202                         cm_req_rcv.req_remote_qpn);
3203                     cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
3204                         cm_req_rcv.req_remote_qkey);
3205                     cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
3206                         (void *)qp, (void *)qp->qp_hdl);
3207                 }
3208 
3209                 if (rib_debug > 2) {
3210                     ibt_rc_chan_query_attr_t    chan_attrs;
3211 
3212                     if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
3213                         == IBT_SUCCESS) {
3214                         cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in "
3215                             "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
3216                     }
3217                 }
3218 #endif
3219 
3220                 ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
3221                 ret_args->cm_ret.rep.cm_rdma_ra_out = 4; 
3222                 ret_args->cm_ret.rep.cm_rdma_ra_in = 4; 
3223                 ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
3224 
3225                 /*
3226                  * Pre-posts RECV buffers
3227                  */
3228                 conn = qptoc(qp);
3229                 for (i = 0; i < preposted_rbufs; i++) {
3230                     bzero(&rdbuf, sizeof (rdbuf));
3231                     rdbuf.type = RECV_BUFFER;
3232                     buf = rib_rbuf_alloc(conn, &rdbuf);
3233                     if (buf == NULL) {
3234                         cmn_err(CE_WARN, "rib_svc_cm_handler: "
3235                             "No RECV_BUFFER buf!\n");
3236                         (void) rib_disconnect_channel(conn, NULL);
3237                         return (IBT_CM_REJECT);
3238                     }
3239 
3240                     bzero(&cl, sizeof (cl));
3241                     cl.c_saddr = (uintptr_t)rdbuf.addr;
3242                     cl.c_len = rdbuf.len;
3243                     cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */
3244                     cl.c_next = NULL;
3245                     status = rib_post_recv(conn, &cl);
3246                     if (status != RDMA_SUCCESS) {
3247                         cmn_err(CE_WARN, "rib_srv_cm_handler: failed "
3248                             "posting RPC_REQ buf to qp %p!", (void *)qp);
3249                         (void) rib_disconnect_channel(conn, NULL);
3250                         return (IBT_CM_REJECT);
3251                     }
3252                 }
3253                 (void) rib_add_connlist(conn, &hca->srv_conn_list);
3254 
3255                 /*
3256                  * Get the address translation service record from ATS
3257                  */
3258                 rw_enter(&hca->state_lock, RW_READER);
3259                 if (hca->state == HCA_DETACHED) {
3260                     rw_exit(&hca->state_lock);
3261                     return (IBT_CM_REJECT);
3262                 }
3263                 rw_exit(&hca->state_lock);
3264 
3265                 for (i = 0; i < hca->hca_nports; i++) {
3266                     ibt_status = ibt_get_port_state(hca->hca_hdl, i+1,
3267                                         &sgid, NULL);
3268                     if (ibt_status != IBT_SUCCESS) {
3269                         if (rib_debug) {
3270                             cmn_err(CE_WARN, "rib_srv_cm_handler: "
3271                                 "ibt_get_port_state FAILED!"
3272                                 "status = %d\n", ibt_status);
3273                         }
3274                     } else {
3275                         /*
3276                          * do ibt_query_ar()
3277                          */
3278                         bzero(&ar_query, sizeof (ar_query));
3279                         bzero(&ar_result, sizeof (ar_result));
3280                         ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid;
3281                         ar_query.ar_pkey = event->cm_event.req.req_pkey;
3282                         ibt_status = ibt_query_ar(&sgid, &ar_query,
3283                                                         &ar_result);
3284                         if (ibt_status != IBT_SUCCESS) {
3285                             if (rib_debug) {
3286                                 cmn_err(CE_WARN, "rib_srv_cm_handler: "
3287                                     "ibt_query_ar FAILED!"
3288                                     "status = %d\n", ibt_status);
3289                             }
3290                         } else {
3291                             conn = qptoc(qp);
3292 
3293                             if (is_for_ipv4(&ar_result)) {
3294                                 struct sockaddr_in *s;
3295                                 int sin_size = sizeof (struct sockaddr_in);
3296                                 int in_size = sizeof (struct in_addr);
3297                                 uint8_t *start_pos;
3298 
3299                                 conn->c_raddr.maxlen =
3300                                         conn->c_raddr.len = sin_size;
3301                                 conn->c_raddr.buf = kmem_zalloc(sin_size,
3302                                                 KM_SLEEP);
3303                                 s = (struct sockaddr_in *)conn->c_raddr.buf;
3304                                 s->sin_family = AF_INET;
3305                                 /*
3306                                  * For IPv4,  the IP addr is stored in
3307                                  * the last four bytes of ar_data.
3308                                  */
3309                                 start_pos = ar_result.ar_data +
3310                                         ATS_AR_DATA_LEN - in_size;
3311                                 bcopy(start_pos, &s->sin_addr, in_size);
3312                                 if (rib_debug > 1) {
3313                                     char print_addr[INET_ADDRSTRLEN];
3314 
3315                                     bzero(print_addr, INET_ADDRSTRLEN);
3316                                     (void) inet_ntop(AF_INET, &s->sin_addr,
3317                                                 print_addr, INET_ADDRSTRLEN);
3318                                     cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3319                                         "remote clnt_addr: %s\n", print_addr);
3320                                 }
3321                             } else {
3322                                 struct sockaddr_in6 *s6;
3323                                 int sin6_size = sizeof (struct sockaddr_in6);
3324 
3325                                 conn->c_raddr.maxlen =
3326                                         conn->c_raddr.len = sin6_size;
3327                                 conn->c_raddr.buf = kmem_zalloc(sin6_size,
3328                                         KM_SLEEP);
3329 
3330                                 s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3331                                 s6->sin6_family = AF_INET6;
3332                                 /* sin6_addr is stored in ar_data */
3333                                 bcopy(ar_result.ar_data, &s6->sin6_addr,
3334                                         sizeof (struct in6_addr));
3335                                 if (rib_debug > 1) {
3336                                     char print_addr[INET6_ADDRSTRLEN];
3337 
3338                                     bzero(print_addr, INET6_ADDRSTRLEN);
3339                                     (void) inet_ntop(AF_INET6, &s6->sin6_addr,
3340                                                 print_addr, INET6_ADDRSTRLEN);
3341                                     cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3342                                         "remote clnt_addr: %s\n", print_addr);
3343                                 }
3344                             }
3345                             return (IBT_CM_ACCEPT);
3346                         }
3347                     }
3348                 }
3349                 if (rib_debug > 1) {
3350                     cmn_err(CE_WARN, "rib_srv_cm_handler: "
3351                                 "address record query failed!");
3352                 }
3353                 break;
3354 
3355         case IBT_CM_EVENT_CONN_CLOSED:
3356         {
3357                 CONN            *conn;
3358                 rib_qp_t        *qp;
3359 
3360                 switch (event->cm_event.closed) {
3361                 case IBT_CM_CLOSED_DREP_RCVD:
3362                 case IBT_CM_CLOSED_DREQ_TIMEOUT:
3363                 case IBT_CM_CLOSED_DUP:
3364                 case IBT_CM_CLOSED_ABORT:
3365                 case IBT_CM_CLOSED_ALREADY:
3366                         /*
3367                          * These cases indicate the local end initiated
3368                          * the closing of the channel. Nothing to do here.
3369                          */
3370                         break;
3371                 default:
3372                         /*
3373                          * Reason for CONN_CLOSED event must be one of
3374                          * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3375                          * or IBT_CM_CLOSED_STALE. These indicate cases were
3376                          * the remote end is closing the channel. In these
3377                          * cases free the channel and transition to error
3378                          * state
3379                          */
3380                         qp = ibt_get_chan_private(event->cm_channel);
3381                         conn = qptoc(qp);
3382                         mutex_enter(&conn->c_lock);
3383                         if (conn->c_state == C_DISCONN_PEND) {
3384                                 mutex_exit(&conn->c_lock);
3385                                 break;
3386                         }
3387                         conn->c_state = C_ERROR;
3388 
3389                         /*
3390                          * Free the rc_channel. Channel has already
3391                          * transitioned to ERROR state and WRs have been
3392                          * FLUSHED_ERR already.
3393                          */
3394                         (void) ibt_free_channel(qp->qp_hdl);
3395                         qp->qp_hdl = NULL;
3396 
3397                         /*
3398                          * Free the conn if c_ref goes down to 0
3399                          */
3400                         if (conn->c_ref == 0) {
3401                                 /*
3402                                  * Remove from list and free conn
3403                                  */
3404                                 conn->c_state = C_DISCONN_PEND;
3405                                 mutex_exit(&conn->c_lock);
3406                                 (void) rib_disconnect_channel(conn,
3407                                         &hca->srv_conn_list);
3408                         } else {
3409                                 mutex_exit(&conn->c_lock);
3410                         }
3411 #ifdef DEBUG
3412                         if (rib_debug)
3413                                 cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3414                                         " (CONN_CLOSED) channel disconnected");
3415 #endif
3416                         break;
3417                 }
3418                 break;
3419         }
3420         case IBT_CM_EVENT_CONN_EST:
3421         /*
3422          * RTU received, hence connection established.
3423          */
3424                 if (rib_debug > 1)
3425                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3426                                 "(CONN_EST) channel established");
3427                 break;
3428 
3429         default:
3430             if (rib_debug > 2) {
3431                 /* Let CM handle the following events. */
3432                 if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3433                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3434                             "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3435                 } else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3436                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3437                             "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3438                 } else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3439                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3440                             "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3441                 } else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3442                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3443                             "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3444                 } else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3445                         cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3446                             "server recv'ed IBT_CM_EVENT_FAILURE\n");
3447                 }
3448             }
3449             return (IBT_CM_REJECT);
3450         }
3451 
3452         /* accept all other CM messages (i.e. let the CM handle them) */
3453         return (IBT_CM_ACCEPT);
3454 }
3455 
/*
 * rib_register_ats -- register address-translation-service (ATS) records
 * for every active port on the given HCA.
 *
 * For each active port, and for each full-membership pkey in that port's
 * pkey table, the routine looks up the matching ibd (IP-over-IB) instance
 * address via get_ibd_entry(), packs the instance's IPv4 or IPv6 address
 * into the ibt_ar_t ar_data field, and registers the record with
 * ibt_register_ar().  Each successful registration is linked onto
 * hca->ats_list (under hca->service_list_lock held as writer) so it can
 * be found and deregistered later.
 *
 * Returns RDMA_SUCCESS if at least one address record was registered;
 * RDMA_FAILED otherwise (HCA detached, port query failure, no ibd
 * instances found, IP address lookup failure, or zero successful
 * registrations).
 */
static rdma_stat
rib_register_ats(rib_hca_t *hca)
{
        ibt_hca_portinfo_t      *port_infop;
        uint_t                  port_size;
        uint_t                  pki, i, num_ports, nbinds;
        ibt_status_t            ibt_status;
        rib_service_t           *new_service, *temp_srv;
        rpcib_ats_t             *atsp;
        rpcib_ibd_insts_t       ibds;
        ib_pkey_t               pkey;
        ibt_ar_t                ar;     /* address record */

        /*
         * Query all ports for the given HCA.  The state lock is held
         * only across the query; bail out early if the HCA has already
         * been detached.
         */
        rw_enter(&hca->state_lock, RW_READER);
        if (hca->state != HCA_DETACHED) {
                ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
                    &num_ports, &port_size);
                rw_exit(&hca->state_lock);
        } else {
                rw_exit(&hca->state_lock);
                return (RDMA_FAILED);
        }
        if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
            if (rib_debug) {
                cmn_err(CE_NOTE, "rib_register_ats: FAILED in "
                    "ibt_query_hca_ports, status = %d\n", ibt_status);
            }
#endif
                return (RDMA_FAILED);
        }

#ifdef  DEBUG
        if (rib_debug > 1) {
                cmn_err(CE_NOTE, "rib_register_ats: Ports detected "
                    "%d\n", num_ports);

                for (i = 0; i < num_ports; i++) {
                        if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
                                cmn_err(CE_WARN, "rib_register_ats "
                                    "Port #: %d INACTIVE\n", i+1);
                        } else if (port_infop[i].p_linkstate ==
                            IBT_PORT_ACTIVE) {
                                cmn_err(CE_NOTE, "rib_register_ats "
                                    "Port #: %d ACTIVE\n", i+1);
                        }
                }
        }
#endif

        /*
         * Collect the plumbed ibd instances on this system; without at
         * least one there is nothing to translate, so free everything
         * and fail.
         */
        ibds.rib_ibd_alloc = N_IBD_INSTANCES;
        ibds.rib_ibd_cnt = 0;
        ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc *
                        sizeof (rpcib_ats_t), KM_SLEEP);
        rib_get_ibd_insts(&ibds);

        if (ibds.rib_ibd_cnt == 0) {
            kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
                                sizeof (rpcib_ats_t));
            ibt_free_portinfo(port_infop, port_size);
            return (RDMA_FAILED);
        }

        /*
         * Get the IP addresses of active ports and
         * register them with ATS.  IPv4 addresses
         * have precedence over IPv6 addresses.
         */
        if (get_ibd_ipaddr(&ibds) != 0) {
#ifdef  DEBUG
            if (rib_debug > 1) {
                cmn_err(CE_WARN, "rib_register_ats: "
                    "get_ibd_ipaddr failed");
            }
#endif
            kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
                                sizeof (rpcib_ats_t));
            ibt_free_portinfo(port_infop, port_size);
            return (RDMA_FAILED);
        }

        /*
         * Start ATS registration for active ports on this HCA.
         * service_list_lock protects hca->ats_list for the duration of
         * the whole registration pass.
         */
        rw_enter(&hca->service_list_lock, RW_WRITER);
        nbinds = 0;
        new_service = NULL;
        for (i = 0; i < num_ports; i++) {
                if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
                        continue;

            for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
                pkey = port_infop[i].p_pkey_tbl[pki];
                /*
                 * Only full-membership (IBSRM_HB set), valid pkeys are
                 * considered for registration.
                 */
                if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
                    ar.ar_gid = port_infop[i].p_sgid_tbl[0];
                    ar.ar_pkey = pkey;
                    atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds);
                    if (atsp == NULL)
                        continue;
                /*
                 * store the sin[6]_addr in ar_data
                 */
                    (void) bzero(ar.ar_data, ATS_AR_DATA_LEN);
                    if (atsp->ras_inet_type == AF_INET) {
                        uint8_t *start_pos;

                        /*
                         * The ipv4 addr goes into the last
                         * four bytes of ar_data.
                         */
                        start_pos = ar.ar_data + ATS_AR_DATA_LEN -
                                sizeof (struct in_addr);
                        bcopy(&atsp->ras_sin.sin_addr, start_pos,
                                sizeof (struct in_addr));
                    } else if (atsp->ras_inet_type == AF_INET6) {
                        bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data,
                                sizeof (struct in6_addr));
                    } else
                        continue;

                    ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar);
                    if (ibt_status == IBT_SUCCESS) {
#ifdef  DEBUG
                        if (rib_debug > 1) {
                                cmn_err(CE_WARN, "rib_register_ats: "
                                    "ibt_register_ar OK on port %d", i+1);
                        }
#endif
                        /*
                         * Allocate and prepare a service entry
                         */
                        new_service = kmem_zalloc(sizeof (rib_service_t),
                                KM_SLEEP);
                        new_service->srv_port = i + 1;
                        new_service->srv_ar = ar;
                        new_service->srv_next = NULL;

                        /*
                         * Add to the service list for this HCA
                         * (prepended to hca->ats_list).
                         */
                        new_service->srv_next = hca->ats_list;
                        hca->ats_list = new_service;
                        new_service = NULL;
                        nbinds ++;
                    } else {
#ifdef  DEBUG
                        if (rib_debug > 1) {
                            cmn_err(CE_WARN, "rib_register_ats: "
                            "ibt_register_ar FAILED on port %d", i+1);
                        }
#endif
                    }
                }
            }
        }

#ifdef  DEBUG
        if (rib_debug > 1) {
                for (temp_srv = hca->ats_list; temp_srv != NULL;
                        temp_srv = temp_srv->srv_next) {
                                cmn_err(CE_NOTE, "Service: ATS, active on"
                                        " port: %d\n", temp_srv->srv_port);
                }
        }
#endif

        rw_exit(&hca->service_list_lock);
        kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
        ibt_free_portinfo(port_infop, port_size);

        /* Success only if at least one record was registered. */
        if (nbinds == 0) {
#ifdef  DEBUG
        if (rib_debug > 1) {
                cmn_err(CE_WARN, "rib_register_ats FAILED!\n");
        }
#endif
                return (RDMA_FAILED);
        }
        return (RDMA_SUCCESS);
}
3639 
/*
 * rib_register_service -- register and bind an RDMA service (currently
 * only NFS) on every active port of the given HCA.
 *
 * Each non-loopback IP address on the system is used as a service name
 * ("<addr>::NFS").  For each name, ibt_register_service() obtains a
 * service id/handle, and ibt_bind_service() binds it to every active
 * port / full-membership pkey combination on the HCA.  Successful binds
 * are recorded on hca->service_list (under hca->service_list_lock held
 * as writer).  rib_srv_cm_handler is installed as the CM handler for
 * incoming connections.
 *
 * Returns RDMA_SUCCESS if at least one bind succeeded (and moves the
 * plugin to the ACCEPT state); RDMA_FAILED otherwise.
 *
 * NOTE(review): when ibt_register_service() succeeds for an address but
 * every subsequent ibt_bind_service() for it fails, the srv_hdl obtained
 * here does not appear to be deregistered on the failure path -- possible
 * service-handle leak; confirm against ibt_deregister_service() usage in
 * the teardown code.
 */
static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type)
{
        ibt_srv_desc_t          sdesc;
        ibt_srv_bind_t          sbind;
        ibt_hca_portinfo_t      *port_infop;
        ib_svc_id_t             srv_id;
        ibt_srv_hdl_t           srv_hdl;
        uint_t                  port_size;
        uint_t                  pki, i, j, num_ports, nbinds;
        ibt_status_t            ibt_status;
        char                    **addrs;
        int                     addr_count;
        rib_service_t           *new_service, *temp_srv;
        ib_pkey_t               pkey;

        /*
         * Query all ports for the given HCA.  Bail out if the HCA has
         * already been detached; the state lock is held only across the
         * query itself.
         */
        rw_enter(&hca->state_lock, RW_READER);
        if (hca->state != HCA_DETACHED) {
                ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
                    &num_ports, &port_size);
                rw_exit(&hca->state_lock);
        } else {
                rw_exit(&hca->state_lock);
                return (RDMA_FAILED);
        }
        if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
                cmn_err(CE_NOTE, "rib_register_service: FAILED in "
                    "ibt_query_hca_ports, status = %d\n", ibt_status);
#endif
                return (RDMA_FAILED);
        }

#ifdef  DEBUG
        if (rib_debug > 1) {
                cmn_err(CE_NOTE, "rib_register_service: Ports detected "
                    "%d\n", num_ports);

                for (i = 0; i < num_ports; i++) {
                        if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
                                cmn_err(CE_WARN, "rib_register_service "
                                    "Port #: %d INACTIVE\n", i+1);
                        } else if (port_infop[i].p_linkstate ==
                            IBT_PORT_ACTIVE) {
                                cmn_err(CE_NOTE, "rib_register_service "
                                    "Port #: %d ACTIVE\n", i+1);
                        }
                }
        }
#endif
        /*
         * Get all the IP addresses on this system to register the
         * given "service type" on all DNS recognized IP addrs.
         * Each service type such as NFS will have all the systems
         * IP addresses as its different names. For now the only
         * type of service we support in RPCIB is NFS.
         */
        addrs = get_ip_addrs(&addr_count);
        if (addrs == NULL) {
#ifdef DEBUG
                if (rib_debug) {
                    cmn_err(CE_WARN, "rib_register_service: "
                        "get_ip_addrs failed\n");
                }
#endif
                ibt_free_portinfo(port_infop, port_size);
                return (RDMA_FAILED);
        }

#ifdef  DEBUG
        if (rib_debug > 1) {
                for (i = 0; i < addr_count; i++)
                        cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
        }
#endif

        rw_enter(&hca->service_list_lock, RW_WRITER);
        /*
         * Start registering and binding service to active
         * on active ports on this HCA.
         */
        nbinds = 0;
        new_service = NULL;

        /*
         * We use IP addresses as the service names for
         * service registration.  Register each of them
         * with CM to obtain a svc_id and svc_hdl.  We do not
         * register the service with machine's loopback address.
         * (Loop starts at j = 1: index 0 is presumably the loopback
         * address per get_ip_addrs() -- confirm against its
         * implementation.)
         */
        for (j = 1; j < addr_count; j++) {
            (void) bzero(&srv_id, sizeof (ib_svc_id_t));
            (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
            (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));

            /* rib_srv_cm_handler will field CM events for this service. */
            sdesc.sd_handler = rib_srv_cm_handler;
            sdesc.sd_flags = 0;

            ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
                            &sdesc, 0, 1, &srv_hdl, &srv_id);
            if (ibt_status != IBT_SUCCESS) {
#ifdef DEBUG
                if (rib_debug) {
                    cmn_err(CE_WARN, "rib_register_service: "
                        "ibt_register_service FAILED, status "
                        "= %d\n", ibt_status);
                }
#endif
                /*
                 * No need to go on, since we failed to obtain
                 * a srv_id and srv_hdl. Move on to the next
                 * IP addr as a service name.
                 */
                continue;
            }
            for (i = 0; i < num_ports; i++) {
                if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
                        continue;

                for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
                    pkey = port_infop[i].p_pkey_tbl[pki];
                    /*
                     * Bind only on full-membership (IBSRM_HB), valid
                     * pkeys.
                     */
                    if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {

                        /*
                         * Allocate and prepare a service entry
                         */
                        new_service = kmem_zalloc(1 * sizeof (rib_service_t),
                            KM_SLEEP);
                        new_service->srv_type = service_type;
                        new_service->srv_port = i + 1;
                        new_service->srv_id = srv_id;
                        new_service->srv_hdl = srv_hdl;
                        new_service->srv_sbind_hdl = kmem_zalloc(1 *
                            sizeof (ibt_sbind_hdl_t), KM_SLEEP);

                        /*
                         * Service name is "<ip-addr>::NFS"; srv_name is
                         * zero-filled so the strlcat below always sees a
                         * terminated string.
                         */
                        new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
                            KM_SLEEP);
                        (void) bcopy(addrs[j], new_service->srv_name,
                            IB_SVC_NAME_LEN);
                        (void) strlcat(new_service->srv_name, "::NFS",
                                IB_SVC_NAME_LEN);
                        new_service->srv_next = NULL;

                        /*
                         * Bind the service, specified by the IP address,
                         * to the port/pkey using the srv_hdl returned
                         * from ibt_register_service().
                         */
                        (void) bzero(&sbind, sizeof (ibt_srv_bind_t));
                        sbind.sb_pkey = pkey;
                        sbind.sb_lease = 0xFFFFFFFF;
                        sbind.sb_key[0] = NFS_SEC_KEY0;
                        sbind.sb_key[1] = NFS_SEC_KEY1;
                        sbind.sb_name = new_service->srv_name;

#ifdef  DEBUG
                        if (rib_debug > 1) {
                                cmn_err(CE_NOTE, "rib_register_service: "
                                    "binding service using name: %s\n",
                                    sbind.sb_name);
                        }
#endif
                        ibt_status = ibt_bind_service(srv_hdl,
                            port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
                            new_service->srv_sbind_hdl);
                        if (ibt_status != IBT_SUCCESS) {
#ifdef  DEBUG
                            if (rib_debug) {
                                cmn_err(CE_WARN, "rib_register_service: FAILED"
                                    " in ibt_bind_service, status = %d\n",
                                    ibt_status);
                            }
#endif
                                /*
                                 * Bind failed: release everything
                                 * allocated for this entry and try the
                                 * next pkey/port.
                                 */
                                kmem_free(new_service->srv_sbind_hdl,
                                    sizeof (ibt_sbind_hdl_t));
                                kmem_free(new_service->srv_name,
                                    IB_SVC_NAME_LEN);
                                kmem_free(new_service,
                                    sizeof (rib_service_t));
                                new_service = NULL;
                                continue;
                        }
#ifdef  DEBUG
                        if (rib_debug > 1) {
                                if (ibt_status == IBT_SUCCESS)
                                        cmn_err(CE_NOTE, "rib_regstr_service: "
                                            "Serv: %s REGISTERED on port: %d",
                                            sbind.sb_name, i+1);
                        }
#endif
                        /*
                         * Add to the service list for this HCA
                         * (prepended to hca->service_list).
                         */
                        new_service->srv_next = hca->service_list;
                        hca->service_list = new_service;
                        new_service = NULL;
                        nbinds ++;
                    }
                }
            }
        }
        rw_exit(&hca->service_list_lock);

#ifdef  DEBUG
        if (rib_debug > 1) {
                /*
                 * Change this print to a more generic one, as rpcib
                 * is supposed to handle multiple service types.
                 */
                for (temp_srv = hca->service_list; temp_srv != NULL;
                        temp_srv = temp_srv->srv_next) {
                                cmn_err(CE_NOTE, "NFS-IB, active on port:"
                                        " %d\n"
                                        "Using name: %s", temp_srv->srv_port,
                                        temp_srv->srv_name);
                }
        }
#endif

        /* Free the port info and the address strings from get_ip_addrs(). */
        ibt_free_portinfo(port_infop, port_size);
        for (i = 0; i < addr_count; i++) {
                if (addrs[i])
                        kmem_free(addrs[i], IB_SVC_NAME_LEN);
        }
        kmem_free(addrs, addr_count * sizeof (char *));

        if (nbinds == 0) {
#ifdef  DEBUG
            if (rib_debug) {
                cmn_err(CE_WARN, "rib_register_service: "
                    "bind_service FAILED!\n");
            }
#endif
                return (RDMA_FAILED);
        } else {
                /*
                 * Put this plugin into accept state, since atleast
                 * one registration was successful.
                 */
                mutex_enter(&plugin_state_lock);
                plugin_state = ACCEPT;
                mutex_exit(&plugin_state_lock);
                return (RDMA_SUCCESS);
        }
}
3888 
3889 void
3890 rib_listen(struct rdma_svc_data *rd)
3891 {
3892         rdma_stat status = RDMA_SUCCESS;
3893 
3894         rd->active = 0;
3895         rd->err_code = RDMA_FAILED;
3896 
3897         /*
3898          * First check if a hca is still attached
3899          */
3900         rw_enter(&rib_stat->hca->state_lock, RW_READER);
3901         if (rib_stat->hca->state != HCA_INITED) {
3902                 rw_exit(&rib_stat->hca->state_lock);
3903                 return;
3904         }
3905         rw_exit(&rib_stat->hca->state_lock);
3906 
3907         rib_stat->q = &rd->q;
3908         /*
3909          * Register the Address translation service
3910          */
3911         mutex_enter(&rib_stat->open_hca_lock);
3912         if (ats_running == 0) {
3913                 if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
3914 #ifdef  DEBUG
3915                     if (rib_debug) {
3916                         cmn_err(CE_WARN,
3917                             "rib_listen(): ats registration failed!");
3918                     }
3919 #endif
3920                     mutex_exit(&rib_stat->open_hca_lock);
3921                     return;
3922                 } else {
3923                         ats_running = 1;
3924                 }
3925         }
3926         mutex_exit(&rib_stat->open_hca_lock);
3927 
3928         /*
3929          * Right now the only service type is NFS. Hence force feed this
3930          * value. Ideally to communicate the service type it should be
3931          * passed down in rdma_svc_data.
3932          */
3933         rib_stat->service_type = NFS;
3934         status = rib_register_service(rib_stat->hca, NFS);
3935         if (status != RDMA_SUCCESS) {
3936                 rd->err_code = status;
3937                 return;
3938         }
3939         /*
3940          * Service active on an HCA, check rd->err_code for more
3941          * explainable errors.
3942          */
3943         rd->active = 1;
3944         rd->err_code = status;
3945 }
3946 
3947 /* XXXX */
3948 /* ARGSUSED */
3949 static void
3950 rib_listen_stop(struct rdma_svc_data *svcdata)
3951 {
3952         rib_hca_t               *hca;
3953 
3954         /*
3955          * KRPC called the RDMATF to stop the listeners, this means
3956          * stop sending incomming or recieved requests to KRPC master
3957          * transport handle for RDMA-IB. This is also means that the
3958          * master transport handle, responsible for us, is going away.
3959          */
3960         mutex_enter(&plugin_state_lock);
3961         plugin_state = NO_ACCEPT;
3962         if (svcdata != NULL)
3963                 svcdata->active = 0;
3964         mutex_exit(&plugin_state_lock);
3965 
3966         /*
3967          * First check if a hca is still attached
3968          */
3969         hca = rib_stat->hca;
3970         rw_enter(&hca->state_lock, RW_READER);
3971         if (hca->state != HCA_INITED) {
3972                 rw_exit(&hca->state_lock);
3973                 return;
3974         }
3975         rib_stop_services(hca);
3976         rw_exit(&hca->state_lock);
3977 }
3978 
3979 /*
3980  * Traverse the HCA's service list to unbind and deregister services.
3981  * Instead of unbinding the service for a service handle by
3982  * calling ibt_unbind_service() for each port/pkey, we unbind
3983  * all the services for the service handle by making only one
3984  * call to ibt_unbind_all_services().  Then, we deregister the
3985  * service for the service handle.
3986  *
3987  * When traversing the entries in service_list, we compare the
3988  * srv_hdl of the current entry with that of the next.  If they
3989  * are different or if the next entry is NULL, the current entry
3990  * marks the last binding of the service handle.  In this case,
3991  * call ibt_unbind_all_services() and deregister the service for
3992  * the service handle.  If they are the same, the current and the
3993  * next entries are bound to the same service handle.  In this
3994  * case, move on to the next entry.
3995  */
static void
rib_stop_services(rib_hca_t *hca)
{
	rib_service_t		*srv_list, *to_remove;
	ibt_status_t		ibt_status;

	/*
	 * unbind and deregister the services for this service type.
	 * Right now there is only one service type. In future it will
	 * be passed down to this function.
	 */
	rw_enter(&hca->service_list_lock, RW_WRITER);
	srv_list = hca->service_list;
	while (srv_list != NULL) {
		to_remove = srv_list;
		srv_list = to_remove->srv_next;
		/*
		 * Entries bound to the same srv_hdl are adjacent in the
		 * list; only the last entry of such a run triggers the
		 * unbind/deregister (see block comment above).
		 * NOTE(review): bcmp() is given the handles themselves,
		 * so it compares the first sizeof (ibt_srv_hdl_t) bytes
		 * of the objects the handles point at, not the handle
		 * values (&to_remove->srv_hdl). Presumably equivalent
		 * here -- verify against the IBTF handle layout.
		 */
		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {

		    /* Drop all port/pkey bindings in one call. */
		    ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
		    if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "rib_listen_stop: "
			    "ibt_unbind_all_services FAILED"
				" status: %d\n", ibt_status);
		    }

		    /* Then deregister the service handle itself. */
		    ibt_status =
			ibt_deregister_service(hca->ibt_clnt_hdl,
				to_remove->srv_hdl);
		    if (ibt_status != IBT_SUCCESS) {
			cmn_err(CE_WARN, "rib_listen_stop: "
			    "ibt_deregister_service FAILED"
				" status: %d\n", ibt_status);
		    }

#ifdef	DEBUG
		    if (rib_debug > 1) {
			if (ibt_status == IBT_SUCCESS)
				cmn_err(CE_NOTE, "rib_listen_stop: "
				    "Successfully stopped and"
				    " UNREGISTERED service: %s\n",
				    to_remove->srv_name);
		    }
#endif
		}
		/* Free the per-binding bookkeeping regardless. */
		kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
		kmem_free(to_remove->srv_sbind_hdl,
			sizeof (ibt_sbind_hdl_t));

		kmem_free(to_remove, sizeof (rib_service_t));
	}
	hca->service_list = NULL;
	rw_exit(&hca->service_list_lock);
}
4050 
4051 static struct svc_recv *
4052 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
4053 {
4054         struct svc_recv *recvp;
4055 
4056         recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
4057         recvp->vaddr = sgl->ds_va;
4058         recvp->qp = qp;
4059         recvp->bytes_xfer = 0;
4060         return (recvp);
4061 }
4062 
4063 static int
4064 rib_free_svc_recv(struct svc_recv *recvp)
4065 {
4066         kmem_free(recvp, sizeof (*recvp));
4067 
4068         return (0);
4069 }
4070 
4071 static struct reply *
4072 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
4073 {
4074         struct reply    *rep;
4075 
4076 
4077         rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
4078         if (rep == NULL) {
4079                 mutex_exit(&qp->replylist_lock);
4080                 cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
4081                 return (NULL);
4082         }
4083         rep->xid = msgid;
4084         rep->vaddr_cq = NULL;
4085         rep->bytes_xfer = 0;
4086         rep->status = (uint_t)REPLY_WAIT;
4087         rep->prev = NULL;
4088         cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
4089 
4090         mutex_enter(&qp->replylist_lock);
4091         if (qp->replylist) {
4092                 rep->next = qp->replylist;
4093                 qp->replylist->prev = rep;
4094         }
4095         qp->rep_list_size++;
4096         if (rib_debug > 1)
4097             cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
4098                 (void *)qp, qp->rep_list_size);
4099         qp->replylist = rep;
4100         mutex_exit(&qp->replylist_lock);
4101 
4102         return (rep);
4103 }
4104 
4105 static rdma_stat
4106 rib_rem_replylist(rib_qp_t *qp)
4107 {
4108         struct reply    *r, *n;
4109 
4110         mutex_enter(&qp->replylist_lock);
4111         for (r = qp->replylist; r != NULL; r = n) {
4112                 n = r->next;
4113                 (void) rib_remreply(qp, r);
4114         }
4115         mutex_exit(&qp->replylist_lock);
4116 
4117         return (RDMA_SUCCESS);
4118 }
4119 
4120 static int
4121 rib_remreply(rib_qp_t *qp, struct reply *rep)
4122 {
4123 
4124         ASSERT(MUTEX_HELD(&qp->replylist_lock));
4125         if (rep->prev) {
4126                 rep->prev->next = rep->next;
4127         }
4128         if (rep->next) {
4129                 rep->next->prev = rep->prev;
4130         }
4131         if (qp->replylist == rep)
4132                 qp->replylist = rep->next;
4133 
4134         cv_destroy(&rep->wait_cv);
4135         qp->rep_list_size--;
4136         if (rib_debug > 1)
4137             cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
4138                 (void *)qp, qp->rep_list_size);
4139 
4140         kmem_free(rep, sizeof (*rep));
4141 
4142         return (0);
4143 }
4144 
/*
 * Register `buf' (buflen bytes) for RDMA on the connection's HCA and
 * return the local/remote keys in *buf_handle.  When FMR support is
 * compiled in, the fast-registration path is tried first and the
 * ordinary ibt_register_mr() path is the fallback.
 */
rdma_stat
rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
#ifdef IB_FMR_SUP
	ibt_pmr_desc_t	pmr_desc;	/* vaddr, lkey, rkey */
	ibt_ma_hdl_t	ma_hdl = NULL;
#endif
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
#ifdef IB_FMR_SUP
	/* Fast path: FMR registration; fall through on failure. */
	status = rib_reg_mem_fmr(hca, adsp, buf, buflen, 0, &mr_hdl, &ma_hdl,
	    &pmr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
		buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
		buf_handle->mrc_lma = (uintptr_t)ma_hdl;
		goto ret_stat;
	} else {
		/* Clear the handle before trying the regular path. */
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lma = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
#endif
	/* Regular memory-region registration path. */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	ret_stat:
	return (status);
}
4190 
#ifdef IB_FMR_SUP
/*
 * Register `buf' (size bytes) through the HCA's fast-memory-
 * registration (FMR) pool.  On success *mr_hdlp, *ma_hdlp and
 * *pmr_descp describe the registration; the caller must later undo it
 * via ibt_unmap_mem_area() + ibt_deregister_fmr().  Kernel address
 * space only (adsp must be NULL).
 */
static rdma_stat
rib_reg_mem_fmr(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
	ibt_mr_flags_t spec, ibt_mr_hdl_t *mr_hdlp, ibt_ma_hdl_t *ma_hdlp,
	ibt_pmr_desc_t *pmr_descp)
{
	ibt_va_attr_t	va_attr;
	ibt_phys_buf_t	*paddr_list;
	uint_t		paddr_list_len, num_paddr;
	size_t		buf_sz = 0;
	ibt_pmr_attr_t	pmr_attr;
	ib_memlen_t	paddr_offset;
	ibt_status_t	ibt_status;
	uint_t		h_page_sz;

	/* FMR of a user address space is not supported. */
	if (adsp)
		return (RDMA_FAILED);

	bzero(&va_attr, sizeof (ibt_va_attr_t));
	va_attr.va_vaddr = (ib_vaddr_t)buf;
	va_attr.va_len   = size;
	va_attr.va_as    = (struct as *)(caddr_t)adsp;
	va_attr.va_flags = IBT_VA_FMR | IBT_VA_SLEEP;
	if (spec == IBT_MR_NONCOHERENT)
		va_attr.va_flags |= IBT_VA_NONCOHERENT;
	va_attr.va_phys_buf_min = va_attr.va_phys_buf_max = 0;

	/* hca_page_sz is in KB; worst case needs size/page + 2 entries. */
	h_page_sz = hca->hca_attrs.hca_page_sz * 1024;
	paddr_list_len = (size / h_page_sz) + 2;
	paddr_list = (ibt_phys_buf_t *)kmem_zalloc(sizeof (ibt_phys_buf_t) *
	    paddr_list_len, KM_NOSLEEP);
	if (paddr_list == NULL) {
		/*
		 * BUG FIX: the KM_NOSLEEP allocation was previously used
		 * without a NULL check, dereferencing NULL under memory
		 * pressure.
		 */
		return (RDMA_FAILED);
	}

	if (rib_debug > 0) {
		cmn_err(CE_NOTE, "fmr: vaddr %p, size %d paddr_list_len %d \n",
		    buf, size, paddr_list_len);
	}

	ibt_status = ibt_map_mem_area(hca->hca_hdl, &va_attr, paddr_list_len,
	    paddr_list, &num_paddr, &buf_sz, &paddr_offset, ma_hdlp);
	if (ibt_status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_map_mem_area failed: "
		    "status %d", ibt_status);
		kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
		return (RDMA_FAILED);
	}

	if (rib_debug > 0) {
		cmn_err(CE_NOTE,"fmr: p_laddr %p, p_size %d, buf_sz %d, p_ofset %llX\n",
		    paddr_list[0].p_laddr, paddr_list[0].p_size, buf_sz,
		    paddr_offset);
		cmn_err(CE_NOTE,"fmr: ibt_map_mem_area: ret %d, num_paddr %d, spec %d\n",
		    ibt_status, num_paddr, spec);
	}

	/* Describe the physical buffer list to the FMR pool. */
	bzero(&pmr_attr, sizeof (ibt_pmr_attr_t));
	pmr_attr.pmr_iova = (ib_vaddr_t)buf;
	pmr_attr.pmr_len = size;
	pmr_attr.pmr_num_buf = num_paddr;
	pmr_attr.pmr_buf_sz = buf_sz;
	pmr_attr.pmr_buf_list = paddr_list;
	pmr_attr.pmr_offset = paddr_offset;
	pmr_attr.pmr_flags = spec;
	pmr_attr.pmr_ma = *ma_hdlp;

	ibt_status = ibt_register_physical_fmr(hca->hca_hdl, hca->fmr_pool,
	    &pmr_attr, mr_hdlp, pmr_descp);
	if (ibt_status != IBT_SUCCESS) {
		cmn_err(CE_WARN, "rib_reg_mem_fmr: ibt_register_physical_fmr "
		    "failed: status %d", ibt_status);
		/* Unwind the memory-area mapping before bailing out. */
		(void) ibt_unmap_mem_area(hca->hca_hdl, *ma_hdlp);
		*ma_hdlp = NULL;
		kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);
		return (RDMA_FAILED);
	}

	if (rib_debug > 0) {
		cmn_err(CE_NOTE,"fmr: rkey: 0x%lX  lkey: 0x%lX, iova: %p, fmr_hdl %p \n",
		    pmr_descp->pmd_rkey, pmr_descp->pmd_lkey,
		    pmr_descp->pmd_iova, *mr_hdlp);
	}

	/* The scratch physical-buffer list is no longer needed. */
	kmem_free(paddr_list, sizeof (ibt_phys_buf_t) * paddr_list_len);

	return (RDMA_SUCCESS);
}

#endif
4276 static rdma_stat 
4277 rib_reg_mem(rib_hca_t *hca,   caddr_t adsp, caddr_t buf, uint_t size, ibt_mr_flags_t spec, 
4278         ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
4279 {
4280         ibt_mr_attr_t   mem_attr;
4281         ibt_status_t    ibt_status;

4282         mem_attr.mr_vaddr = (uintptr_t)buf;
4283         mem_attr.mr_len = (ib_msglen_t)size;
4284         mem_attr.mr_as = (struct as *)(caddr_t)adsp; 
4285         mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
4286             IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
4287             IBT_MR_ENABLE_WINDOW_BIND | spec;
4288 
4289         rw_enter(&hca->state_lock, RW_READER);
4290         if (hca->state == HCA_INITED) {
4291                 ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
4292                                         &mem_attr, mr_hdlp, mr_descp);
4293                 rw_exit(&hca->state_lock);
4294         } else {
4295                 rw_exit(&hca->state_lock);
4296                 return (RDMA_FAILED);
4297         }
4298 
4299         if (ibt_status != IBT_SUCCESS) {
4300                 cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
4301                         "(spec:%d) failed for addr %llX, status %d",
4302                         spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
4303                 return (RDMA_FAILED);
4304         }
4305         return (RDMA_SUCCESS);
4306 }
4307 
4308 rdma_stat
4309 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen, 
4310 #ifdef SERVER_REG_CACHE 
4311         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc) 
4312 #else 
4313         struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
4314 #endif
4315 {
4316         ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
4317 #ifdef IB_FMR_SUP
4318         ibt_pmr_desc_t  pmr_desc;       /* vaddr, lkey, rkey */
4319         ibt_ma_hdl_t    ma_hdl = NULL;
4320 #endif
4321 #ifdef SERVER_REG_CACHE
4322         rib_lrc_entry_t *l;
4323 #endif
4324         ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
4325         rdma_stat       status;
4326         rib_hca_t       *hca = (ctoqp(conn))->hca;
4327 
4328         /*
4329          * Non-coherent memory registration.
4330          */
4331 #ifdef SERVER_REG_CACHE 
4332         l = (rib_lrc_entry_t *)lrc; 
4333         if(l){ 
4334                 if(l->registered){ 
4335                 buf_handle->mrc_linfo = (uintptr_t)l->lrc_mhandle.mrc_linfo; 
4336                 buf_handle->mrc_lmr   = (uint32_t)l->lrc_mhandle.mrc_lmr; 
4337                 buf_handle->mrc_rmr   = (uint32_t)l->lrc_mhandle.mrc_rmr; 
4338 #ifdef IB_FMR_SUP 
4339                 buf_handle->mrc_lma   = (uintptr_t)l->lrc_mhandle.mrc_lma; 
4340 #endif 
4341                 *sync_handle          = (RIB_SYNCMEM_HANDLE)l->lrc_mhandle.mrc_linfo; 
4342                 return(RDMA_SUCCESS); 
4343                 } else { 
4344                         /* Always register the whole buffer */ 
4345                         buf = (caddr_t)l->lrc_buf; 
4346                         buflen = l->lrc_len; 
4347                         /*cmn_err(CE_NOTE,"Register %p of length %d\n",buf,buflen);*/ 
4348                 } 
4349              } 
4350 #endif 
4351 #ifdef IB_FMR_SUP 
4352         status = rib_reg_mem_fmr(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl, 
4353             &ma_hdl, &pmr_desc); 
4354         if (status == RDMA_SUCCESS) { 
4355                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl; 
4356                 buf_handle->mrc_lma = (uintptr_t)ma_hdl; 
4357                 buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey; 
4358                 buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey; 
4359                 *sync_handle        = (RIB_SYNCMEM_HANDLE)mr_hdl; 
4360 #ifdef SERVER_REG_CACHE 
4361                 if(l){ 
4362                 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl; 
4363                 l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey; 
4364                 l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey; 
4365                 l->registered                 = TRUE; 
4366                 l->lrc_mhandle.mrc_lma   = (uintptr_t)ma_hdl; 
4367                 } 
4368 #endif 
4369                 goto ret_stat; 
4370                  
4371         } else { 
4372                 if (rib_debug > 1) 
4373                         cmn_err(CE_WARN,"fmr reg failed for buffer %p of length %d\n",buf,buflen); 
4374                 buf_handle->mrc_linfo = NULL; 
4375                 buf_handle->mrc_lma = NULL; 
4376                 buf_handle->mrc_lmr = 0; 
4377                 buf_handle->mrc_rmr = 0; 
4378         } 
4379 #endif 
4380         status = rib_reg_mem(hca, adsp, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl, 
4381                         &mr_desc);
4382         if (status == RDMA_SUCCESS) {
4383 #ifdef SERVER_REG_CACHE
4384                 if(l){
4385                 l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
4386                 l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
4387                 l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
4388                 l->registered                 = TRUE;
4389                 }
4390 #endif
4391                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
4392                 buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4393                 buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4394                 *sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
4395         } else {
4396                 buf_handle->mrc_linfo = NULL;
4397                 buf_handle->mrc_lmr = 0;
4398                 buf_handle->mrc_rmr = 0;
4399         }
4400         ret_stat:
4401         return (status);
4402 }
4403 
4404 /* ARGSUSED */
4405 rdma_stat
4406 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
4407 {
4408         avl_index_t where = NULL;
4409 #ifdef IB_FMR_SUP
4410         ibt_status_t    ibt_status;
4411 #endif
4412         rib_hca_t *hca = (ctoqp(conn))->hca;

4413         /*
4414          * Allow memory deregistration even if HCA is
4415          * getting detached. Need all outstanding
4416          * memory registrations to be deregistered
4417          * before HCA_DETACH_EVENT can be accepted.
4418          */
4419 #ifdef IB_FMR_SUP
4420         if(buf_handle.mrc_lma){
4421         ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
4422             (ibt_ma_hdl_t)buf_handle.mrc_lma);
4423         if (ibt_status != IBT_SUCCESS){
4424                 cmn_err(CE_WARN,"rib_deregistermem: ibt_unmap_mem_area: %d failed",
4425                     ibt_status);
4426                 return (RDMA_FAILED);
4427                 }
4428 
4429         ibt_status = ibt_deregister_fmr(hca->hca_hdl,
4430             (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4431         if (ibt_status != IBT_SUCCESS)
4432                                 return (RDMA_FAILED);
4433         return (RDMA_SUCCESS);
4434         }
4435 #endif
4436         (void) ibt_deregister_mr(hca->hca_hdl,
4437                         (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4438         return (RDMA_SUCCESS);
4439 }
4440 
4441 /* ARGSUSED */
4442 rdma_stat
4443 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
4444 #ifdef SERVER_REG_CACHE
4445                 RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
4446 #else
4447                 RIB_SYNCMEM_HANDLE sync_handle)
4448 #endif
4449 {
4450 #ifdef SERVER_REG_CACHE
4451         rib_lrc_entry_t *l;
4452         l = (rib_lrc_entry_t *)lrc;
4453         if(l)
4454           if(l->registered)
4455                 return(RDMA_SUCCESS);
4456 #endif
4457 
4458 
4459         (void) rib_deregistermem(conn, buf, buf_handle);
4460 
4461         return (RDMA_SUCCESS);
4462 }
4463 
4464 /* ARGSUSED */
4465 rdma_stat
4466 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
4467                 int len, int cpu)
4468 {
4469         ibt_status_t    status;
4470         rib_hca_t *hca = (ctoqp(conn))->hca;
4471         ibt_mr_sync_t   mr_segment;
4472 
4473         mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
4474         mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
4475         mr_segment.ms_len = (ib_memlen_t)len;
4476         if (cpu) {
4477                 /* make incoming data visible to memory */
4478                 mr_segment.ms_flags = IBT_SYNC_WRITE;
4479         } else {
4480                 /* make memory changes visible to IO */
4481                 mr_segment.ms_flags = IBT_SYNC_READ;
4482         }
4483         rw_enter(&hca->state_lock, RW_READER);
4484         if (hca->state == HCA_INITED) {
4485                 status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
4486                 rw_exit(&hca->state_lock);
4487         } else {
4488                 rw_exit(&hca->state_lock);
4489                 return (RDMA_FAILED);
4490         }
4491 
4492         if (status == IBT_SUCCESS)
4493                 return (RDMA_SUCCESS);
4494         else {
4495 #ifdef DEBUG
4496                 cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n",
4497                         status);
4498 #endif
4499                 return (RDMA_FAILED);
4500         }
4501 }
4502 
4503 /*
4504  * XXXX ????
4505  */
4506 static rdma_stat
4507 rib_getinfo(rdma_info_t *info)
4508 {
4509         /*
4510          * XXXX Hack!
4511          */
4512         info->addrlen = 16;
4513         info->mts = 1000000;
4514         info->mtu = 1000000;
4515 
4516         return (RDMA_SUCCESS);
4517 }
4518 
/*
 * Create and register a pool of `num' fixed-size RDMA buffers of type
 * `ptype' (SEND_BUFFER or RECV_BUFFER) on `hca'.  Each buffer slice is
 * registered with the HCA as its own memory region.  Returns the new
 * pool, or NULL on failure (all partially built state is unwound).
 */
rib_bufpool_t *
rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
{
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp = NULL;
	caddr_t		buf;
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;
	int		i, j;

	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);

	/* bufpool_t carries a trailing array of `num' buffer pointers. */
	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
			num * sizeof (void *), KM_SLEEP);

	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
	bp->numelems = num;

	/* Pick the per-buffer size for this pool type. */
	switch (ptype) {
	    case SEND_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_MSG_SZ;
		break;
	    case RECV_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_BUF_SIZE;
		break;
	    default:
		goto fail;
	}

	/*
	 * Register the pool.
	 */
	bp->bufsize = num * bp->rsize;
	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
			sizeof (ibt_mr_hdl_t), KM_SLEEP);
	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
			sizeof (ibt_mr_desc_t), KM_SLEEP);
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_INITED) {
		rw_exit(&hca->state_lock);
		cmn_err(CE_WARN,"hca->state != HCA_INITED");
		goto fail;
	}
	/* Register each rsize-byte slice of the pool as its own MR. */
	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
		mem_attr.mr_vaddr = (uintptr_t)buf;
		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
		mem_attr.mr_as = NULL;
		ibt_status = ibt_register_mr(hca->hca_hdl,
			hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
			&rbp->mr_desc[i]);
		if (ibt_status != IBT_SUCCESS) {
		    /* Unwind the registrations that did succeed. */
		    for (j = 0; j < i; j++) {
			(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
		    }
		    rw_exit(&hca->state_lock);
		    goto fail;
		}
	}
	rw_exit(&hca->state_lock);
	/* Seed the free list: every buffer starts out available. */
	buf = (caddr_t)bp->buf;
	for (i = 0; i < num; i++, buf += bp->rsize) {
		bp->buflist[i] = (void *)buf;
	}
	bp->buffree = num - 1;	/* no. of free buffers */
	rbp->bpool = bp;

	return (rbp);
fail:
	if (bp) {
	    if (bp->buf)
		kmem_free(bp->buf, bp->bufsize);
	    kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
	}
	if (rbp) {
	    if (rbp->mr_hdl)
		kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
	    if (rbp->mr_desc)
		kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
	    kmem_free(rbp, sizeof (rib_bufpool_t));
	}
	return (NULL);
}
4606 
4607 static void
4608 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
4609 {
4610         int i;
4611         rib_bufpool_t *rbp = NULL;
4612         bufpool_t *bp;
4613 
4614         /*
4615          * Obtain pool address based on type of pool
4616          */
4617         switch (ptype) {
4618                 case SEND_BUFFER:
4619                         rbp = hca->send_pool;
4620                         break;
4621                 case RECV_BUFFER:
4622                         rbp = hca->recv_pool;
4623                         break;
4624                 default:
4625                         return;
4626         }
4627         if (rbp == NULL)
4628                 return;
4629 
4630         bp = rbp->bpool;
4631 
4632         /*
4633          * Deregister the pool memory and free it.
4634          */
4635         for (i = 0; i < bp->numelems; i++) {
4636                 (void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
4637         }
4638 }
4639 
4640 static void
4641 rib_rbufpool_free(rib_hca_t *hca, int ptype)
4642 {
4643 
4644         rib_bufpool_t *rbp = NULL;
4645         bufpool_t *bp;
4646 
4647         /*
4648          * Obtain pool address based on type of pool
4649          */
4650         switch (ptype) {
4651                 case SEND_BUFFER:
4652                         rbp = hca->send_pool;
4653                         break;
4654                 case RECV_BUFFER:
4655                         rbp = hca->recv_pool;
4656                         break;
4657                 default:
4658                         return;
4659         }
4660         if (rbp == NULL)
4661                 return;
4662 
4663         bp = rbp->bpool;
4664 
4665         /*
4666          * Free the pool memory.
4667          */
4668         if (rbp->mr_hdl)
4669                 kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4670 
4671         if (rbp->mr_desc)
4672                 kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));

4673         if (bp->buf)
4674                 kmem_free(bp->buf, bp->bufsize);
4675         mutex_destroy(&bp->buflock);
4676         kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4677         kmem_free(rbp, sizeof (rib_bufpool_t));
4678 }
4679 
4680 void
4681 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4682 {
4683         /*
4684          * Deregister the pool memory and free it.
4685          */
4686         rib_rbufpool_deregister(hca, ptype);
4687         rib_rbufpool_free(hca, ptype);
4688 }
4689 
4690 /*
4691  * Fetch a buffer from the pool of type specified in rdbuf->type.
4692  */
4693 static rdma_stat
4694 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4695 {
4696 
4697         rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4698         if (rdbuf->addr) {
4699                 switch (rdbuf->type) {
4700                 case SEND_BUFFER:
4701                         rdbuf->len = RPC_MSG_SZ;     /* 1K */
4702                         break;
4703                 case RECV_BUFFER:
4704                         rdbuf->len = RPC_BUF_SIZE; /* 2K */
4705                         break;
4706                 default:
4707                         rdbuf->len = 0;
4708                 }
4709                 return (RDMA_SUCCESS);
4710         } else
4711                 return (RDMA_FAILED);
4712 }
4713 
#if defined(MEASURE_POOL_DEPTH)
/*
 * No-op stubs, compiled only when MEASURE_POOL_DEPTH is defined.
 * They are called from rib_rbuf_alloc() with the current pool depth
 * so it can be observed externally (e.g. via a tracing tool).
 */
/* ARGSUSED */
static void
rib_recv_bufs(uint32_t x)
{
}

/* ARGSUSED */
static void
rib_send_bufs(uint32_t x)
{
}
#endif
4722 
4723 /*
4724  * Fetch a buffer of specified type.
4725  * Note that rdbuf->handle is mw's rkey.
4726  */
static void *
rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
{
        rib_qp_t        *qp = ctoqp(conn);
        rib_hca_t       *hca = qp->hca;
        rdma_btype      ptype = rdbuf->type;
        void            *buf;
        rib_bufpool_t   *rbp = NULL;
        bufpool_t       *bp;
        int             i;

        /*
         * Obtain pool address based on type of pool
         */
        switch (ptype) {
                case SEND_BUFFER:
                        rbp = hca->send_pool;
                        break;
                case RECV_BUFFER:
                        rbp = hca->recv_pool;
                        break;
                default:
                        return (NULL);
        }
        if (rbp == NULL)
                return (NULL);

        bp = rbp->bpool;

        mutex_enter(&bp->buflock);
        /*
         * buffree indexes the topmost free buffer on the free stack;
         * it reaches -1 when the pool is exhausted.
         */
        if (bp->buffree < 0) {
                cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
                mutex_exit(&bp->buflock);
                return (NULL);
        }

        /* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
        buf = bp->buflist[bp->buffree];
        rdbuf->addr = buf;
        rdbuf->len = bp->rsize;
        /*
         * Linear search of the memory-registration descriptors for the
         * entry whose registered virtual address matches this buffer,
         * so the caller gets the rkey/lkey and MR handle needed for
         * RDMA work requests.
         */
        for (i = bp->numelems - 1; i >= 0; i--) {
            if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
                rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
                rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
                rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
#if defined(MEASURE_POOL_DEPTH)
                if(ptype == SEND_BUFFER)
                rib_send_bufs(MAX_BUFS - (bp->buffree+1));
                if(ptype == RECV_BUFFER)
                rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
#endif
                /* Pop the buffer off the free stack. */
                bp->buffree--;
                if (rib_debug > 1)
                    cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
                        "(type %d)\n", bp->buffree+1, ptype);

                mutex_exit(&bp->buflock);

                return (buf);
            }
        }
        /* No registration descriptor matched; should not happen. */
        cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
                "type %d found!", buf, ptype);
        mutex_exit(&bp->buflock);

        return (NULL);
}
4794 
4795 static void
4796 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4797 {
4798 
4799         rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4800 }
4801 
4802 static void
4803 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4804 {
4805         rib_qp_t *qp = ctoqp(conn);
4806         rib_hca_t *hca = qp->hca;
4807         rib_bufpool_t *rbp = NULL;
4808         bufpool_t *bp;
4809 
4810         /*
4811          * Obtain pool address based on type of pool
4812          */
4813         switch (ptype) {
4814                 case SEND_BUFFER:
4815                         rbp = hca->send_pool;
4816                         break;
4817                 case RECV_BUFFER:
4818                         rbp = hca->recv_pool;
4819                         break;
4820                 default:
4821                         return;
4822         }
4823         if (rbp == NULL)
4824                 return;
4825 
4826         bp = rbp->bpool;
4827 
4828         mutex_enter(&bp->buflock);
4829         if (++bp->buffree >= bp->numelems) {
4830                 /*
4831                  * Should never happen
4832                  */
4833                 cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) "
4834                         "too many frees!", ptype);
4835                 bp->buffree--;
4836         } else {
4837                 bp->buflist[bp->buffree] = buf;
4838                 if (rib_debug > 1)
4839                     cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs "
4840                         "(type %d)\n", bp->buffree+1, ptype);
4841         }
4842         mutex_exit(&bp->buflock);
4843 }
4844 
4845 static rdma_stat
4846 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4847 {
4848         rw_enter(&connlist->conn_lock, RW_WRITER);
4849         if (connlist->conn_hd) {
4850                 cn->c_next = connlist->conn_hd;
4851                 connlist->conn_hd->c_prev = cn;
4852         }
4853         connlist->conn_hd = cn;
4854         rw_exit(&connlist->conn_lock);
4855 
4856         return (RDMA_SUCCESS);
4857 }
4858 
4859 static rdma_stat
4860 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4861 {
4862         rw_enter(&connlist->conn_lock, RW_WRITER);
4863         if (cn->c_prev) {
4864                 cn->c_prev->c_next = cn->c_next;
4865         }
4866         if (cn->c_next) {
4867                 cn->c_next->c_prev = cn->c_prev;
4868         }
4869         if (connlist->conn_hd == cn)
4870                 connlist->conn_hd = cn->c_next;
4871         rw_exit(&connlist->conn_lock);
4872 
4873         return (RDMA_SUCCESS);
4874 }
4875 
4876 /*
4877  * Connection management.
 * IBTF does not support recycling of channels. So a connection is only
 * ever in one of four states: C_CONN_PEND, C_CONNECTED, C_ERROR or
 * C_DISCONN_PEND. There is no C_IDLE state.
4881  * C_CONN_PEND state: Connection establishment in progress to the server.
4882  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4883  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4884  * only in this state.
4885  * C_ERROR state: A connection transitions to this state when WRs on the
4886  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4887  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4888  * C_DISCONN_PEND state: When a connection is in C_ERROR state and when
4889  * c_ref drops to 0 (this indicates that RPC has no more references to this
4890  * connection), the connection should be destroyed. A connection transitions
4891  * into this state when it is being destroyed.
4892  */
/*
 * Find (or create) a client connection to the server at svcaddr on the
 * given HCA.  On success *conn holds a connection with a reference
 * taken for the caller.  May block waiting for a connection another
 * thread is still establishing (C_CONN_PEND).
 */
static rdma_stat
rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
{
        CONN *cn;
        int status = RDMA_SUCCESS;
        rib_hca_t *hca = (rib_hca_t *)handle;
        rib_qp_t *qp;
        clock_t cv_stat, timout;
        ibt_path_info_t path;

again:
        rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
        cn = hca->cl_conn_list.conn_hd;
        while (cn != NULL) {
                /*
                 * First, clear up any connection in the ERROR state
                 */
                mutex_enter(&cn->c_lock);
                if (cn->c_state == C_ERROR) {
                        if (cn->c_ref == 0) {
                                /*
                                 * Remove connection from list and destroy it.
                                 */
                                cn->c_state = C_DISCONN_PEND;
                                mutex_exit(&cn->c_lock);
                                rw_exit(&hca->cl_conn_list.conn_lock);
                                (void) rib_disconnect_channel(cn,
                                    &hca->cl_conn_list);
                                /* List lock was dropped; restart the scan. */
                                goto again;
                        }
                        mutex_exit(&cn->c_lock);
                        cn = cn->c_next;
                        continue;
                } else if (cn->c_state == C_DISCONN_PEND) {
                        /* Another thread is already tearing this one down. */
                        mutex_exit(&cn->c_lock);
                        cn = cn->c_next;
                        continue;
                }
                if ((cn->c_raddr.len == svcaddr->len) &&
                    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
                        /*
                         * Our connection. Give up conn list lock
                         * as we are done traversing the list.
                         */
                        rw_exit(&hca->cl_conn_list.conn_lock);
                        if (cn->c_state == C_CONNECTED) {
                                cn->c_ref++; /* sharing a conn */
                                mutex_exit(&cn->c_lock);
                                *conn = cn;
                                return (status);
                        }
                        if (cn->c_state == C_CONN_PEND) {
                                /*
                                 * Hold a reference to this conn before
                                 * we give up the lock.
                                 */
                                cn->c_ref++;
                                timout =  ddi_get_lbolt() +
                                    drv_usectohz(CONN_WAIT_TIME * 1000000);
                                /*
                                 * Wait until the connecting thread either
                                 * completes or the wait times out / is
                                 * interrupted by a signal.
                                 */
                                while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
                                        &cn->c_lock, timout)) > 0 &&
                                        cn->c_state == C_CONN_PEND)
                                        ;
                                if (cv_stat == 0) {
                                        /* Interrupted by a signal. */
                                        cn->c_ref--;
                                        mutex_exit(&cn->c_lock);
                                        return (RDMA_INTR);
                                }
                                if (cv_stat < 0) {
                                        /* Timed out waiting. */
                                        cn->c_ref--;
                                        mutex_exit(&cn->c_lock);
                                        return (RDMA_TIMEDOUT);
                                }
                                if (cn->c_state == C_CONNECTED) {
                                        *conn = cn;
                                        mutex_exit(&cn->c_lock);
                                        return (status);
                                } else {
                                        /* Connect attempt did not succeed. */
                                        cn->c_ref--;
                                        mutex_exit(&cn->c_lock);
                                        return (RDMA_TIMEDOUT);
                                }
                        }
                }
                mutex_exit(&cn->c_lock);
                cn = cn->c_next;
        }
        rw_exit(&hca->cl_conn_list.conn_lock);

        /*
         * No usable connection found: verify the server has an ATS
         * record and obtain a path to it before creating a channel.
         */
        status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
        if (status != RDMA_SUCCESS) {
#ifdef DEBUG
                if (rib_debug) {
                        cmn_err(CE_WARN, "rib_conn_get: "
                                "No server ATS record!");
                }
#endif
                return (RDMA_FAILED);
        }

        /*
         * Channel to server doesn't exist yet, create one.
         */
        if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
                return (RDMA_FAILED);
        }
        cn = qptoc(qp);
        cn->c_state = C_CONN_PEND;
        cn->c_ref = 1;

        /*
         * Add to conn list.
         * We had given up the READER lock. In the time since then,
         * another thread might have created the connection we are
         * trying here. But for now, that is quite all right - there
         * might be two connections between a pair of hosts instead
         * of one. If we really want to close that window,
         * then need to check the list after acquiring the
         * WRITER lock.
         */
        (void) rib_add_connlist(cn, &hca->cl_conn_list);
        status = rib_conn_to_srv(hca, qp, &path);
        mutex_enter(&cn->c_lock);
        if (status == RDMA_SUCCESS) {
                cn->c_state = C_CONNECTED;
                *conn = cn;
        } else {
                cn->c_state = C_ERROR;
                cn->c_ref--;
#ifdef DEBUG
                if (rib_debug) {
                        cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
                            " a channel!");
                }
#endif
        }
        /* Wake threads blocked in C_CONN_PEND waiting on this conn. */
        cv_broadcast(&cn->c_cv);
        mutex_exit(&cn->c_lock);
        return (status);
}
5033 
5034 static rdma_stat
5035 rib_conn_release(CONN *conn)
5036 {
5037         rib_qp_t        *qp = ctoqp(conn);
5038 
5039         mutex_enter(&conn->c_lock);
5040         conn->c_ref--;
5041 
5042         /*
5043          * If a conn is C_ERROR, close the channel.
5044          * If it's CONNECTED, keep it that way.
5045          */
5046         if (conn->c_ref == 0 && (conn->c_state &  C_ERROR)) {
5047                 conn->c_state = C_DISCONN_PEND;
5048                 mutex_exit(&conn->c_lock);
5049                 if (qp->mode == RIB_SERVER)
5050                         (void) rib_disconnect_channel(conn,
5051                             &qp->hca->srv_conn_list);
5052                 else
5053                         (void) rib_disconnect_channel(conn,
5054                             &qp->hca->cl_conn_list);
5055                 return (RDMA_SUCCESS);
5056         }
5057         mutex_exit(&conn->c_lock);
5058         return (RDMA_SUCCESS);
5059 }
5060 
5061 /*
5062  * Add at front of list
5063  */
5064 static struct rdma_done_list *
5065 rdma_done_add(rib_qp_t *qp, uint32_t xid)
5066 {
5067         struct rdma_done_list *rd;
5068 
5069         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
5070 
5071         rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
5072         rd->xid = xid;
5073         cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
5074 
5075         rd->prev = NULL;
5076         rd->next = qp->rdlist;
5077         if (qp->rdlist != NULL)
5078                 qp->rdlist->prev = rd;
5079         qp->rdlist = rd;
5080 
5081         return (rd);
5082 }
5083 
5084 static void
5085 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
5086 {
5087         struct rdma_done_list *r;
5088 
5089         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
5090 
5091         r = rd->next;
5092         if (r != NULL) {
5093                 r->prev = rd->prev;
5094         }
5095 
5096         r = rd->prev;
5097         if (r != NULL) {
5098                 r->next = rd->next;
5099         } else {
5100                 qp->rdlist = rd->next;
5101         }
5102 
5103         cv_destroy(&rd->rdma_done_cv);
5104         kmem_free(rd, sizeof (*rd));
5105 }
5106 
5107 static void
5108 rdma_done_rem_list(rib_qp_t *qp)
5109 {
5110         struct rdma_done_list   *r, *n;
5111 
5112         mutex_enter(&qp->rdlist_lock);
5113         for (r = qp->rdlist; r != NULL; r = n) {
5114                 n = r->next;
5115                 rdma_done_rm(qp, r);
5116         }
5117         mutex_exit(&qp->rdlist_lock);
5118 }
5119 
5120 static void
5121 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
5122 {
5123         struct rdma_done_list *r = qp->rdlist;
5124 
5125         ASSERT(MUTEX_HELD(&qp->rdlist_lock));
5126 
5127         while (r) {
5128                 if (r->xid == xid) {
5129                         cv_signal(&r->rdma_done_cv);
5130                         return;
5131                 } else {
5132                         r = r->next;
5133                 }
5134         }
5135         if (rib_debug > 1) {
5136             cmn_err(CE_WARN, "rdma_done_notify: "
5137                 "No matching xid for %u, qp %p\n", xid, (void *)qp);
5138         }
5139 }
5140 
5141 rpcib_ats_t *
5142 get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds)
5143 {
5144         rpcib_ats_t             *atsp;
5145         int                     i;
5146 
5147         for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) {
5148                 if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix &&
5149                     atsp->ras_port_gid.gid_guid == gid->gid_guid &&
5150                     atsp->ras_pkey == pkey) {
5151                         return (atsp);
5152                 }
5153         }
5154         return (NULL);
5155 }
5156 
/*
 * ddi_walk_devs() callback: for each attached "ibport" node plumbed
 * for IP-over-IB ("ipib"), record its instance number, pkey and port
 * GID in the ATS table, growing the table by N_IBD_INSTANCES entries
 * whenever it fills up.  Always continues the walk.
 */
int
rib_get_ibd_insts_cb(dev_info_t *dip, void *arg)
{
        rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg;
        rpcib_ats_t     *atsp;
        ib_pkey_t       pkey;
        uint8_t         port;
        ib_guid_t       hca_guid;
        ib_gid_t        port_gid;

        if (i_ddi_devi_attached(dip) &&
            (strcmp(ddi_node_name(dip), "ibport") == 0) &&
            (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) {

                /* Grow the ATS array when it is full. */
                if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) {
                    rpcib_ats_t *tmp;

                    tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc +
                        N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP);
                    bcopy(ibds->rib_ats, tmp,
                        ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
                    kmem_free(ibds->rib_ats,
                        ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
                    ibds->rib_ats = tmp;
                    ibds->rib_ibd_alloc += N_IBD_INSTANCES;
                }
                /*
                 * Skip the node unless its hca-guid, port-number and
                 * port-pkey properties are all present/valid and the
                 * port state (and thus GID) can be queried from IBTF.
                 */
                if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY,
                        dip, 0, "hca-guid", 0)) == 0) ||
                    ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
                        0, "port-number", 0)) == 0) ||
                    (ibt_get_port_state_byguid(hca_guid, port,
                        &port_gid, NULL) != IBT_SUCCESS) ||
                    ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
                        "port-pkey", IB_PKEY_INVALID_LIMITED)) <=
                        IB_PKEY_INVALID_FULL)) {
                    return (DDI_WALK_CONTINUE);
                }
                atsp = &ibds->rib_ats[ibds->rib_ibd_cnt];
                atsp->ras_inst = ddi_get_instance(dip);
                atsp->ras_pkey = pkey;
                atsp->ras_port_gid = port_gid;
                ibds->rib_ibd_cnt++;
        }
        return (DDI_WALK_CONTINUE);
}
5202 
5203 void
5204 rib_get_ibd_insts(rpcib_ibd_insts_t *ibds)
5205 {
5206         ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds);
5207 }
5208 
5209 /*
5210  * Return ibd interfaces and ibd instances.
5211  */
5212 int
5213 get_ibd_ipaddr(rpcib_ibd_insts_t *ibds)
5214 {
5215         TIUSER                  *tiptr, *tiptr6;
5216         vnode_t                 *kvp, *kvp6;
5217         vnode_t                 *vp = NULL, *vp6 = NULL;
5218         struct strioctl         iocb;
5219         struct lifreq           lif_req;
5220         int                     k, ip_cnt;
5221         rpcib_ats_t             *atsp;
5222 
5223         if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
5224                 &kvp) == 0) {
5225             if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
5226                 &tiptr, CRED()) == 0) {
5227                 vp = tiptr->fp->f_vnode;
5228             } else {
5229                 VN_RELE(kvp);
5230             }
5231         }
5232 
5233         if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP,
5234                 &kvp6) == 0) {
5235             if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE,
5236                 &tiptr6, CRED()) == 0) {
5237                 vp6 = tiptr6->fp->f_vnode;
5238             } else {
5239                 VN_RELE(kvp6);
5240             }
5241         }
5242 
5243         if (vp == NULL && vp6 == NULL)
5244                 return (-1);
5245 
5246         /* Get ibd ip's */
5247         ip_cnt = 0;
5248         for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) {
5249                 /* IPv4 */
5250             if (vp != NULL) {
5251                 (void) bzero((void *)&lif_req, sizeof (struct lifreq));
5252                 (void) snprintf(lif_req.lifr_name,
5253                         sizeof (lif_req.lifr_name), "%s%d",
5254                         IBD_NAME, atsp->ras_inst);
5255 
5256                 (void) bzero((void *)&iocb, sizeof (struct strioctl));
5257                 iocb.ic_cmd = SIOCGLIFADDR;
5258                 iocb.ic_timout = 0;
5259                 iocb.ic_len = sizeof (struct lifreq);
5260                 iocb.ic_dp = (caddr_t)&lif_req;
5261                 if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
5262                     atsp->ras_inet_type = AF_INET;
5263                     bcopy(&lif_req.lifr_addr, &atsp->ras_sin,
5264                         sizeof (struct sockaddr_in));
5265                     ip_cnt++;
5266                     continue;
5267                 }
5268             }
5269                 /* Try IPv6 */
5270             if (vp6 != NULL) {
5271                 (void) bzero((void *)&lif_req, sizeof (struct lifreq));
5272                 (void) snprintf(lif_req.lifr_name,
5273                         sizeof (lif_req.lifr_name), "%s%d",
5274                         IBD_NAME, atsp->ras_inst);
5275 
5276                 (void) bzero((void *)&iocb, sizeof (struct strioctl));
5277                 iocb.ic_cmd = SIOCGLIFADDR;
5278                 iocb.ic_timout = 0;
5279                 iocb.ic_len = sizeof (struct lifreq);
5280                 iocb.ic_dp = (caddr_t)&lif_req;
5281                 if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) {
5282 
5283                     atsp->ras_inet_type = AF_INET6;
5284                     bcopy(&lif_req.lifr_addr, &atsp->ras_sin6,
5285                             sizeof (struct sockaddr_in6));
5286                     ip_cnt++;
5287                 }
5288             }
5289         }
5290 
5291         if (vp6 != NULL) {
5292             (void) t_kclose(tiptr6, 0);
5293             VN_RELE(kvp6);
5294         }
5295         if (vp != NULL) {
5296             (void) t_kclose(tiptr, 0);
5297             VN_RELE(kvp);
5298         }
5299 
5300         if (ip_cnt == 0)
5301             return (-1);
5302         else
5303             return (0);
5304 }
5305 
/*
 * Return a kmem-allocated array of printable IP address strings for
 * all configured interfaces, setting *count to the number of strings;
 * on any failure *count is set to -1 and NULL is returned.  The
 * caller is expected to free the array and the strings (each string
 * buffer is presumably IB_SVC_NAME_LEN bytes -- see find_addrs()).
 */
char **
get_ip_addrs(int *count)
{
        TIUSER                  *tiptr;
        vnode_t                 *kvp;
        int                     num_of_ifs;
        char                    **addresses;
        int                     return_code;

        /*
         * Open a device for doing down stream kernel ioctls
         */
        return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
            NULLVPP, &kvp);
        if (return_code != 0) {
                cmn_err(CE_NOTE, "get_Ip_addrs: lookupname failed\n");
                *count = -1;
                return (NULL);
        }

        return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
            &tiptr, CRED());
        if (return_code != 0) {
                cmn_err(CE_NOTE, "get_Ip_addrs: t_kopen failed\n");
                VN_RELE(kvp);
                *count = -1;
                return (NULL);
        }

        /*
         * Perform the first ioctl to get the number of interfaces
         */
        return_code = get_interfaces(tiptr, &num_of_ifs);
        if (return_code != 0 || num_of_ifs == 0) {
                cmn_err(CE_NOTE, "get_Ip_addrs: get_interfaces failed\n");
                (void) t_kclose(tiptr, 0);
                VN_RELE(kvp);
                *count = -1;
                return (NULL);
        }

        /*
         * Perform the second ioctl to get the address on each interface
         * found.
         */
        addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
        return_code = find_addrs(tiptr, addresses, num_of_ifs);
        if (return_code <= 0) {
                cmn_err(CE_NOTE, "get_Ip_addrs: find_addrs failed\n");
                (void) t_kclose(tiptr, 0);
                kmem_free(addresses, num_of_ifs * sizeof (char *));
                VN_RELE(kvp);
                *count = -1;
                return (NULL);
        }

        /* find_addrs() returned the number of addresses collected. */
        *count = return_code;
        VN_RELE(kvp);
        (void) t_kclose(tiptr, 0);
        return (addresses);
}
5367 
5368 int
5369 get_interfaces(TIUSER *tiptr, int *num)
5370 {
5371         struct lifnum           if_buf;
5372         struct strioctl         iocb;
5373         vnode_t                 *vp;
5374         int                     return_code;
5375 
5376         /*
5377          * Prep the number of interfaces request buffer for ioctl
5378          */
5379         (void) bzero((void *)&if_buf, sizeof (struct lifnum));
5380         if_buf.lifn_family = AF_UNSPEC;
5381         if_buf.lifn_flags = 0;
5382 
5383         /*
5384          * Prep the kernel ioctl buffer and send it down stream
5385          */
5386         (void) bzero((void *)&iocb, sizeof (struct strioctl));
5387         iocb.ic_cmd = SIOCGLIFNUM;
5388         iocb.ic_timout = 0;
5389         iocb.ic_len = sizeof (if_buf);
5390         iocb.ic_dp = (caddr_t)&if_buf;
5391 
5392         vp = tiptr->fp->f_vnode;
5393         return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5394         if (return_code != 0) {
5395                 cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
5396                 *num = -1;
5397                 return (-1);
5398         }
5399 
5400         *num = if_buf.lifn_count;
5401 #ifdef  DEBUG
5402         if (rib_debug > 1)
5403                 cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
5404                     if_buf.lifn_count);
5405 #endif
5406         return (0);
5407 }
5408 
5409 int
5410 find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
5411 {
5412         struct lifconf          lifc;
5413         struct lifreq           *if_data_buf;
5414         struct strioctl         iocb;
5415         caddr_t                 request_buffer;
5416         struct sockaddr_in      *sin4;
5417         struct sockaddr_in6     *sin6;
5418         vnode_t                 *vp;
5419         int                     i, count, return_code;
5420 
5421         /*
5422          * Prep the buffer for requesting all interface's info
5423          */
5424         (void) bzero((void *)&lifc, sizeof (struct lifconf));
5425         lifc.lifc_family = AF_UNSPEC;
5426         lifc.lifc_flags = 0;
5427         lifc.lifc_len = num_ifs * sizeof (struct lifreq);
5428 
5429         request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
5430             KM_SLEEP);
5431 
5432         lifc.lifc_buf = request_buffer;
5433 
5434         /*
5435          * Prep the kernel ioctl buffer and send it down stream
5436          */
5437         (void) bzero((void *)&iocb, sizeof (struct strioctl));
5438         iocb.ic_cmd = SIOCGLIFCONF;
5439         iocb.ic_timout = 0;
5440         iocb.ic_len = sizeof (struct lifconf);
5441         iocb.ic_dp = (caddr_t)&lifc;
5442 
5443         vp = tiptr->fp->f_vnode;
5444         return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5445         if (return_code != 0) {
5446                 cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
5447                 kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
5448                 return (-1);
5449         }
5450 
5451         /*
5452          * Extract addresses and fill them in the requested array
5453          * IB_SVC_NAME_LEN is defined to be 64 so it  covers both IPv4 &
5454          * IPv6. Here count is the number of IP addresses collected.
5455          */
5456         if_data_buf = lifc.lifc_req;
5457         count = 0;
5458         for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
5459         if_data_buf++) {
5460                 if (if_data_buf->lifr_addr.ss_family == AF_INET) {
5461                         sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
5462                         addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
5463                         (void) inet_ntop(AF_INET, &sin4->sin_addr,
5464                             addrs[count], IB_SVC_NAME_LEN);
5465                         count ++;
5466                 }
5467 
5468                 if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
5469                         sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
5470                         addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
5471                         (void) inet_ntop(AF_INET6, &sin6->sin6_addr,
5472                             addrs[count], IB_SVC_NAME_LEN);
5473                         count ++;
5474                 }
5475         }
5476 
5477         kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
5478         return (count);
5479 }
5480 
5481 /*
5482  * Goes through all connections and closes the channel
5483  * This will cause all the WRs on those channels to be
5484  * flushed.
5485  */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
        CONN            *conn;
        rib_qp_t        *qp;

        rw_enter(&connlist->conn_lock, RW_READER);
        conn = connlist->conn_hd;
        while (conn != NULL) {
                mutex_enter(&conn->c_lock);
                qp = ctoqp(conn);
                if (conn->c_state & C_CONNECTED) {
                        /*
                         * Live connection in CONNECTED state.
                         * Call ibt_close_rc_channel in nonblocking mode
                         * with no callbacks.
                         */
                        conn->c_state = C_ERROR;
                        (void) ibt_close_rc_channel(qp->qp_hdl,
                                IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
                        (void) ibt_free_channel(qp->qp_hdl);
                        /* Mark the channel gone so it is not freed twice. */
                        qp->qp_hdl = NULL;
                } else {
                        if (conn->c_state == C_ERROR &&
                                qp->qp_hdl != NULL) {
                                /*
                                 * Connection in ERROR state but
                                 * channel is not yet freed.
                                 */
                                (void) ibt_close_rc_channel(qp->qp_hdl,
                                        IBT_NOCALLBACKS, NULL, 0, NULL,
                                        NULL, 0);
                                (void) ibt_free_channel(qp->qp_hdl);
                                qp->qp_hdl = NULL;
                        }
                }
                mutex_exit(&conn->c_lock);
                conn = conn->c_next;
        }
        rw_exit(&connlist->conn_lock);
}
5527 
5528 /*
5529  * Frees up all connections that are no longer being referenced
5530  */
5531 static void
5532 rib_purge_connlist(rib_conn_list_t *connlist)
5533 {
5534         CONN            *conn;
5535 
5536 top:
5537         rw_enter(&connlist->conn_lock, RW_READER);
5538         conn = connlist->conn_hd;
5539         while (conn != NULL) {
5540                 mutex_enter(&conn->c_lock);
5541 
5542                 /*
5543                  * At this point connection is either in ERROR
5544                  * or DISCONN_PEND state. If in DISCONN_PEND state
5545                  * then some other thread is culling that connection.
5546                  * If not and if c_ref is 0, then destroy the connection.
5547                  */
5548                 if (conn->c_ref == 0 &&
5549                         conn->c_state != C_DISCONN_PEND) {
5550                         /*
5551                          * Cull the connection
5552                          */
5553                         conn->c_state = C_DISCONN_PEND;
5554                         mutex_exit(&conn->c_lock);
5555                         rw_exit(&connlist->conn_lock);
5556                         (void) rib_disconnect_channel(conn, connlist);
5557                         goto top;
5558                 } else {
5559                         /*
5560                          * conn disconnect already scheduled or will
5561                          * happen from conn_release when c_ref drops to 0.
5562                          */
5563                         mutex_exit(&conn->c_lock);
5564                 }
5565                 conn = conn->c_next;
5566         }
5567         rw_exit(&connlist->conn_lock);
5568 
5569         /*
5570          * At this point, only connections with c_ref != 0 are on the list
5571          */
5572 }
5573 
5574 /*
5575  * Cleans and closes up all uses of the HCA
5576  */
5577 static void
5578 rib_detach_hca(rib_hca_t *hca)
5579 {
5580 
5581         /*
5582          * Stop all services on the HCA
5583          * Go through cl_conn_list and close all rc_channels
5584          * Go through svr_conn_list and close all rc_channels
5585          * Free connections whose c_ref has dropped to 0
5586          * Destroy all CQs
5587          * Deregister and released all buffer pool memory after all
5588          * connections are destroyed
5589          * Free the protection domain
5590          * ibt_close_hca()
5591          */
5592         rw_enter(&hca->state_lock, RW_WRITER);
5593         if (hca->state == HCA_DETACHED) {
5594                 rw_exit(&hca->state_lock);
5595                 return;
5596         }
5597 
5598         hca->state = HCA_DETACHED;
5599         rib_stat->nhca_inited--;
5600 
5601         rib_stop_services(hca);
5602         rib_deregister_ats();
5603         rib_close_channels(&hca->cl_conn_list);
5604         rib_close_channels(&hca->srv_conn_list);
5605         rw_exit(&hca->state_lock);
5606 
5607         rib_purge_connlist(&hca->cl_conn_list);
5608         rib_purge_connlist(&hca->srv_conn_list);
5609 
5610         (void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
5611         (void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
5612         (void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
5613         (void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
5614         kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
5615         kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
5616         kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
5617         kmem_free(hca->svc_scq, sizeof (rib_cq_t));
5618 
5619         rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
5620         rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
5621         if (hca->srv_conn_list.conn_hd == NULL &&
5622                 hca->cl_conn_list.conn_hd == NULL) {
5623                 /*
5624                  * conn_lists are NULL, so destroy
5625                  * buffers, close hca and be done.
5626                  */
5627                 rib_rbufpool_destroy(hca, RECV_BUFFER);
5628                 rib_rbufpool_destroy(hca, SEND_BUFFER);
5629 #ifdef SERVER_REG_CACHE
5630                 rib_destroy_cache(hca);
5631 #endif
5632                 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
5633                 (void) ibt_close_hca(hca->hca_hdl);
5634                 hca->hca_hdl = NULL;
5635         }
5636         rw_exit(&hca->cl_conn_list.conn_lock);
5637         rw_exit(&hca->srv_conn_list.conn_lock);
5638 
5639         if (hca->hca_hdl != NULL) {
5640                 mutex_enter(&hca->inuse_lock);
5641                 while (hca->inuse)
5642                         cv_wait(&hca->cb_cv, &hca->inuse_lock);
5643                 mutex_exit(&hca->inuse_lock);
5644                 /*
5645                  * conn_lists are now NULL, so destroy
5646                  * buffers, close hca and be done.
5647                  */
5648                 rib_rbufpool_destroy(hca, RECV_BUFFER);
5649                 rib_rbufpool_destroy(hca, SEND_BUFFER);
5650                 (void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
5651                 (void) ibt_close_hca(hca->hca_hdl);
5652                 hca->hca_hdl = NULL;
5653         }
5654 }
5655 
5656 #ifdef SERVER_REG_CACHE
5657 
5658 static void
5659 rib_server_side_cache_reclaim(void *argp)
5660 {
5661 cache_avl_struct_t    *rcas;
5662 rib_lrc_entry_t      *rb;
5663 rib_hca_t *hca = (rib_hca_t *)argp;
5664 
5665 rw_enter(&hca->avl_rw_lock,RW_WRITER);
5666 rcas = avl_first(&hca->avl_tree);
5667 if(rcas != NULL)
5668                 avl_remove(&hca->avl_tree, rcas);
5669 while(rcas != NULL){
5670         while(rcas->r.forw != &rcas->r){
5671                 rcas->elements--;
5672                 rb = rcas->r.forw;
5673                 remque(rb);
5674                 rib_deregistermem_via_hca(hca, rb->lrc_buf, rb->lrc_mhandle);
5675                 kmem_free(rb->lrc_buf, rb->lrc_len);
5676                 kmem_free(rb, sizeof(rib_lrc_entry_t));
5677         }
5678         mutex_destroy(&rcas->node_lock);
5679         kmem_cache_free(hca->server_side_cache,rcas);
5680         rcas = avl_first(&hca->avl_tree);
5681         if(rcas != NULL)
5682                 avl_remove(&hca->avl_tree, rcas);
5683         }
5684 rw_exit(&hca->avl_rw_lock);
5685 }
5686 
5687 static int avl_compare(const void *t1,const void *t2) {
5688 
5689 if(rib_debug > 1)
5690 cmn_err(CE_NOTE,"Comparing %d and %d\n",((cache_avl_struct_t *)t1)->len, ((cache_avl_struct_t *)t2)->len);
5691 if(((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len) 
5692         return 0;
5693 
5694 if(((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len) 
5695         return -1;
5696 
5697 if(((cache_avl_struct_t *)t1)->len > ((cache_avl_struct_t *)t2)->len) 
5698         return  1;
5699 }
5700 
5701 static void rib_destroy_cache(rib_hca_t *hca) {
5702 cache_avl_struct_t    *rcas, *root;
5703  rib_lrc_entry_t      *rb;
5704 
5705 hca->avl_init = FALSE;
5706 kmem_cache_destroy(hca->server_side_cache);
5707 avl_destroy(&hca->avl_tree);
5708 rw_destroy(&hca->avl_rw_lock);
5709 
5710 }
5711 
5712  static rib_lrc_entry_t *
5713  rib_get_server_cache_buf(CONN *conn,uint32_t len)
5714  {
5715          cache_avl_struct_t    cas,*rcas;
5716          rib_hca_t       *hca = (ctoqp(conn))->hca;
5717          rib_lrc_entry_t *reply_buf;
5718          avl_index_t where = NULL;
5719          struct rib_lrc_entry *forw = NULL;
5720         if(!hca->avl_init)
5721                 goto  error_alloc;
5722          cas.len = len;
5723          rw_enter(&hca->avl_rw_lock, RW_READER);
5724          if((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL){
5725          rw_exit(&hca->avl_rw_lock);
5726          rw_enter(&hca->avl_rw_lock, RW_WRITER);
5727          /* Recheck to make sure no other thread added the entry in */
5728          if((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas, &where)) == NULL){      
5729          /* Allocate an avl tree entry */
5730                 if(rib_debug > 1)
5731                 cmn_err(CE_NOTE,"Allocating an avl entry for length %d\n",len);
5732                 rcas        = (cache_avl_struct_t *)kmem_cache_alloc(hca->server_side_cache,KM_SLEEP);
5733                 bzero(rcas, sizeof(cache_avl_struct_t));
5734                 rcas->elements = 0;
5735                 rcas->r.forw =
5736                         &rcas->r;
5737                 rcas->r.back =
5738                         &rcas->r;
5739                 rcas->len           = len;
5740                 mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5741                 avl_insert(&hca->avl_tree,rcas,where);
5742         }
5743         }  
5744         if(rcas->elements > 0){
5745                         mutex_enter(&rcas->node_lock);
5746                         reply_buf = rcas->r.forw;
5747                         remque(reply_buf);
5748                         rcas->elements --;
5749                         mutex_exit(&rcas->node_lock);
5750                         rw_exit(&hca->avl_rw_lock);
5751                         if(rib_debug > 1)
5752                         cmn_err(CE_NOTE,"Allocating a pre-alloced  buffer for length %d\n",len);
5753                 } else {
5754                         rw_exit(&hca->avl_rw_lock);
5755                         rib_total_buffers ++;
5756                         if(rib_debug > 1)
5757                         cmn_err(CE_NOTE,"Allocating a new  buffer for length %d\n",len);
5758                         /* Allocate a reply_buf entry */
5759                         reply_buf           = (rib_lrc_entry_t *)kmem_alloc(sizeof(rib_lrc_entry_t), KM_SLEEP);
5760                         bzero(reply_buf,sizeof(rib_lrc_entry_t));
5761                         reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5762                         reply_buf->lrc_len  = len;
5763                         reply_buf->registered = FALSE;
5764                         reply_buf->avl_node = (void *)rcas;
5765                         }
5766         
5767         return reply_buf;       
5768         error_alloc:
5769         reply_buf           = (rib_lrc_entry_t *)kmem_alloc(sizeof(rib_lrc_entry_t), KM_SLEEP);
5770         bzero(reply_buf,sizeof(rib_lrc_entry_t));
5771         reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5772         reply_buf->lrc_len  = len;
5773         reply_buf->registered = FALSE;
5774         reply_buf->avl_node = NULL;
5775         return reply_buf;       
5776 }
5777 
5778  /*
5779   * Return a pre-registered back to the cache (without
5780   * unregistering the buffer)..
5781   */
5782 
5783 static void
5784 rib_free_server_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5785 {
5786          cache_avl_struct_t    cas,*rcas;
5787          avl_index_t where = NULL;
5788          rib_hca_t       *hca = (ctoqp(conn))->hca;
5789         if(!reg_buf){
5790                 cmn_err(CE_WARN,"Got a null reg_buf\n");
5791                 return;
5792          }
5793          if(!hca->avl_init)
5794                 goto  error_free;
5795          cas.len = reg_buf->lrc_len;
5796          rw_enter(&hca->avl_rw_lock, RW_READER);
5797          if((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,&cas,&where)) == NULL){
5798                 rw_exit(&hca->avl_rw_lock);
5799                 goto error_free;
5800          } else {
5801          mutex_enter(&rcas->node_lock); 
5802          insque(reg_buf,&rcas->r);
5803          rcas->elements ++;
5804          mutex_exit(&rcas->node_lock);
5805          rw_exit(&hca->avl_rw_lock);
5806          if(rib_debug > 1)
5807          cmn_err(CE_NOTE,"Returning buffer for length %d\n",reg_buf->lrc_len);
5808          } 
5809         return;
5810         error_free:     
5811         rib_deregistermem_via_hca(hca, reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5812         kmem_free(reg_buf->lrc_buf,reg_buf->lrc_len);
5813         kmem_free(reg_buf,sizeof(rib_lrc_entry_t));
5814 }
5815 
5816 #endif
5817 
5818 static rdma_stat
5819  rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5820          uint_t buflen, struct mrc *buf_handle)
5821  {
5822          ibt_mr_hdl_t    mr_hdl = NULL;  /* memory region handle */
5823 #ifdef IB_FMR_SUP
5824         ibt_pmr_desc_t  pmr_desc;       /* vaddr, lkey, rkey */
5825         ibt_ma_hdl_t    ma_hdl = NULL;
5826 #endif
5827          ibt_mr_desc_t   mr_desc;        /* vaddr, lkey, rkey */
5828          rdma_stat       status;
5829 
5830 
5831          /*
5832           * Note: ALL buffer pools use the same memory type RDMARW.
5833          */
5834         /* This code will not be activated on the server. We could remove
5835            the call to rib_reg_mem_fmr. But leave it in, in case the FMR
5836            bugs get fixed. The bigger question is whether we need FMR when
5837            the registered bufffers are coming out of a slab cache. This needs
5838            to be evaluated. 
5839         */
5840 #ifdef IB_FMR_SUP
5841         status = rib_reg_mem_fmr(hca, buf, adsp, buflen, 0, &mr_hdl, &ma_hdl,
5842             &pmr_desc);
5843         if (status == RDMA_SUCCESS) {
5844                 buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
5845                 buf_handle->mrc_lmr = (uint32_t)pmr_desc.pmd_lkey;
5846                 buf_handle->mrc_rmr = (uint32_t)pmr_desc.pmd_rkey;
5847                 buf_handle->mrc_lma = (uintptr_t)ma_hdl;
5848                 goto ret_stat;
5849         } else {
5850                 buf_handle->mrc_linfo = NULL;
5851                 buf_handle->mrc_lma = NULL;
5852                 buf_handle->mrc_lmr = 0;
5853                 buf_handle->mrc_rmr = 0;
5854         }
5855 #endif
5856          status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5857          if (status == RDMA_SUCCESS) {
5858                  buf_handle->mrc_linfo = (uint64_t)mr_hdl;
5859                  buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5860                  buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5861          } else {
5862                  buf_handle->mrc_linfo = NULL;
5863                  buf_handle->mrc_lmr = 0;
5864                  buf_handle->mrc_rmr = 0;
5865          }
5866         ret_stat:
5867          return (status);
5868 }
5869 
5870 /* ARGSUSED */
5871 static rdma_stat
5872 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5873          struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5874  {
5875 
5876          (void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5877 
5878          return (RDMA_SUCCESS);
5879  }
5880 
5881 /* ARGSUSED */
5882  static rdma_stat
5883 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5884 {
5885 #ifdef IB_FMR_SUP
5886         ibt_status_t    ibt_status;
5887         if(buf_handle.mrc_lma){
5888         ibt_status = ibt_unmap_mem_area(hca->hca_hdl,
5889             (ibt_ma_hdl_t)buf_handle.mrc_lma);
5890         if (ibt_status != IBT_SUCCESS){
5891                 cmn_err(CE_WARN,"rib_deregistermem: ibt_unmap_mem_area: %d failed",
5892                     ibt_status);
5893          return (RDMA_FAILED);
5894         }
5895         ibt_status = ibt_deregister_fmr(hca->hca_hdl,
5896             (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5897         if (ibt_status != IBT_SUCCESS){
5898                 cmn_err(CE_WARN,"rib_deregistermem: ibt_unmap_mem_area: %d failed",
5899                     ibt_status);
5900          return (RDMA_FAILED);
5901         }
5902          return (RDMA_SUCCESS);
5903         }
5904 #endif
5905 
5906          (void) ibt_deregister_mr(hca->hca_hdl,
5907                          (ibt_mr_hdl_t)buf_handle.mrc_linfo);
5908          return (RDMA_SUCCESS);
5909 }
5910 
5911 #if defined(ASYNC_SERVER_DEREG)||defined(ASYNC_CLIENT_DEREG)
/*
 * Deregister every chunk in a clist.  'src' selects which side of
 * each element is deregistered: B_TRUE operates on the source fields
 * (c_saddr/c_smemhandle/c_ssynchandle), B_FALSE on the destination
 * fields (c_daddr/c_dmemhandle/c_dsynchandle).  Elements whose
 * handle has mrc_rmr == 0 are already deregistered and are skipped.
 * With SERVER_REG_CACHE, the element's long_reply_buf is passed so
 * cached buffers can be returned instead of torn down.
 * Always returns RDMA_SUCCESS.
 */
static int
clist_deregister1(CONN *conn, struct clist *cl, bool_t src)
{
        struct clist *c;

        for (c = cl; c; c = c->c_next) {
                if (src) {
                        if (c->c_smemhandle.mrc_rmr != 0) {
                                (void) RDMA_DEREGMEMSYNC(conn,
                                    (caddr_t)(uintptr_t)c->c_saddr,
                                    c->c_smemhandle,
#ifdef SERVER_REG_CACHE
                                    (void *)(uintptr_t)c->c_ssynchandle, (void *)c->long_reply_buf);
#else
                                    (void *)(uintptr_t)c->c_ssynchandle);
#endif
                                /* Mark deregistered so a rerun skips it. */
                                c->c_smemhandle.mrc_rmr = 0;
                                c->c_ssynchandle = NULL;
                        }
                } else {
                        if (c->c_dmemhandle.mrc_rmr != 0) {
                                (void) RDMA_DEREGMEMSYNC(conn,
                                    (caddr_t)(uintptr_t)c->c_daddr,
                                    c->c_dmemhandle,
#ifdef SERVER_REG_CACHE
                                    (void *)(uintptr_t)c->c_dsynchandle, (void *)c->long_reply_buf);
#else
                                    (void *)(uintptr_t)c->c_dsynchandle);
#endif
                                c->c_dmemhandle.mrc_rmr = 0;
                                c->c_dsynchandle = NULL;
                        }
                }
        }

        return (RDMA_SUCCESS);
}
5949 #endif
5950 
5951 
5952 
5953 #if defined(ASYNC_CLIENT_DEREG)
5954 static void
5955 async_dereg_thread(caddr_t arg){ 
5956         ASYNC *r;
5957         cmn_err(CE_WARN,"async_dereg_thread initiated\n");
5958         fetch_another_entry:
5959         mutex_enter(&at_mutex);
5960         while ((rqueue.forw == rqueue.back) && (rqueue.forw == &rqueue))
5961         cv_wait(&at_cond, &at_mutex);
5962         r=rqueue.forw;  
5963         remque(rqueue.forw);    
5964         mutex_exit(&at_mutex);
5965         /* Process deregistration */
5966         clist_deregister1(&r->c_conn, &r->c_clist, FALSE);
5967         kmem_free(r, sizeof(ASYNC));
5968         goto fetch_another_entry;
5969 
5970 } 
5971 void insert_queue(CONN  *conn, struct clist  *rwc){ 
5972 ASYNC *r;
5973                         r=kmem_zalloc(sizeof(ASYNC),KM_SLEEP);  
5974                         r->c_clist = *rwc;
5975                         r->c_conn =  *conn; 
5976                         mutex_enter(&at_mutex);
5977                         insque(r,&rqueue);
5978                         cv_broadcast(&at_cond);
5979                         mutex_exit(&at_mutex);      
5980 }
5981 #endif
--- EOF ---