Old clnt_rdma.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 /*
29 * Portions of this source code were derived from Berkeley
30 * 4.3 BSD under license from the Regents of the University of
31 * California.
32 */
33
34 #pragma ident "@(#)clnt_rdma.c 1.10 05/07/26 SMI"
35
36 #include <sys/param.h>
37 #include <sys/types.h>
38 #include <sys/user.h>
39 #include <sys/systm.h>
40 #include <sys/sysmacros.h>
41 #include <sys/errno.h>
42 #include <sys/kmem.h>
43 #include <sys/debug.h>
44 #include <sys/systm.h>
45 #include <sys/kstat.h>
46 #include <sys/t_lock.h>
47 #include <sys/ddi.h>
48 #include <sys/cmn_err.h>
49 #include <sys/time.h>
50 #include <sys/isa_defs.h>
51 #include <sys/zone.h>
52
53 #include <rpc/types.h>
54 #include <rpc/xdr.h>
55 #include <rpc/auth.h>
56 #include <rpc/clnt.h>
57 #include <rpc/rpc_msg.h>
58 #include <rpc/rpc_rdma.h>
59
60
61 static enum clnt_stat clnt_rdma_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
62 caddr_t, xdrproc_t, caddr_t, struct timeval);
63 static void clnt_rdma_kabort(CLIENT *);
64 static void clnt_rdma_kerror(CLIENT *, struct rpc_err *);
65 static bool_t clnt_rdma_kfreeres(CLIENT *, xdrproc_t, caddr_t);
66 static void clnt_rdma_kdestroy(CLIENT *);
67 static bool_t clnt_rdma_kcontrol(CLIENT *, int, char *);
68 static int clnt_rdma_ksettimers(CLIENT *, struct rpc_timers *,
69 struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t);
70
71 /*
72 * Operations vector for RDMA based RPC
73 */
74 static struct clnt_ops rdma_clnt_ops = {
75 clnt_rdma_kcallit, /* do rpc call */
76 clnt_rdma_kabort, /* abort call */
77 clnt_rdma_kerror, /* return error status */
78 clnt_rdma_kfreeres, /* free results */
79 clnt_rdma_kdestroy, /* destroy rpc handle */
80 clnt_rdma_kcontrol, /* the ioctl() of rpc */
81 clnt_rdma_ksettimers, /* set retry timers */
82 };
83
84 /*
85 * The size of the preserialized RPC header information.
86 */
87 #define CKU_HDRSIZE 20
88
89 /*
90 * Per RPC RDMA endpoint details
91 */
92 typedef struct cku_private {
93 CLIENT cku_client; /* client handle */
94 rdma_mod_t *cku_rd_mod; /* underlying RDMA mod */
95 void *cku_rd_handle; /* underlying RDMA device */
96 struct netbuf cku_addr; /* remote netbuf address */
97 int cku_addrfmly; /* for finding addr_type */
98 struct rpc_err cku_err; /* error status */
99 struct cred *cku_cred; /* credentials */
100 XDR cku_outxdr; /* xdr stream for output */
101 uint32_t cku_outsz;
102 XDR cku_inxdr; /* xdr stream for input */
103 char cku_rpchdr[CKU_HDRSIZE+4]; /* rpc header */
104 uint32_t cku_xid; /* current XID */
105 } cku_private_t;
106
107 #define CLNT_RDMA_DELAY 10 /* secs to delay after a connection failure */
108 static int clnt_rdma_min_delay = CLNT_RDMA_DELAY;
109
110 struct {
111 kstat_named_t rccalls;
112 kstat_named_t rcbadcalls;
113 kstat_named_t rcbadxids;
114 kstat_named_t rctimeouts;
115 kstat_named_t rcnewcreds;
116 kstat_named_t rcbadverfs;
117 kstat_named_t rctimers;
118 kstat_named_t rccantconn;
119 kstat_named_t rcnomem;
120 kstat_named_t rcintrs;
121 kstat_named_t rclongrpcs;
122 } rdmarcstat = {
123 { "calls", KSTAT_DATA_UINT64 },
124 { "badcalls", KSTAT_DATA_UINT64 },
125 { "badxids", KSTAT_DATA_UINT64 },
126 { "timeouts", KSTAT_DATA_UINT64 },
127 { "newcreds", KSTAT_DATA_UINT64 },
128 { "badverfs", KSTAT_DATA_UINT64 },
129 { "timers", KSTAT_DATA_UINT64 },
130 { "cantconn", KSTAT_DATA_UINT64 },
131 { "nomem", KSTAT_DATA_UINT64 },
132 { "interrupts", KSTAT_DATA_UINT64 },
133 { "longrpc", KSTAT_DATA_UINT64 }
134 };
135
136 kstat_named_t *rdmarcstat_ptr = (kstat_named_t *)&rdmarcstat;
137 uint_t rdmarcstat_ndata = sizeof (rdmarcstat) / sizeof (kstat_named_t);
138
139 #ifdef DEBUG
140 int rdma_clnt_debug = 0;
141 #endif
142
/*
 * RCSTAT_INCR(x) increments the rdmarcstat counter named x.  When
 * accurate_stats is defined the update is made under rdmarcstat_lock;
 * otherwise it is a plain, unlocked increment.
 *
 * Both variants are wrapped in do { } while (0) and carry no trailing
 * semicolon, so the macro expands to exactly one statement and is safe
 * in an unbraced if/else.  Callers terminate it with ';' as usual.
 */
#ifdef accurate_stats
extern kmutex_t rdmarcstat_lock;	/* mutex for rcstat updates */

#define	RCSTAT_INCR(x)					\
	do {						\
		mutex_enter(&rdmarcstat_lock);		\
		rdmarcstat.x.value.ui64++;		\
		mutex_exit(&rdmarcstat_lock);		\
	} while (0)
#else
#define	RCSTAT_INCR(x)					\
	do {						\
		rdmarcstat.x.value.ui64++;		\
	} while (0)
#endif
154
/*
 * Conversions between the private endpoint structure and the CLIENT
 * handle embedded in it:
 *	ptoh - private structure to client handle
 *	htop - client handle (via cl_private) to private structure
 */
#define	ptoh(p)		(&((p)->cku_client))
#define	htop(h)		((cku_private_t *)((h)->cl_private))
157
158 int
159 clnt_rdma_kcreate(char *proto, void *handle, struct netbuf *raddr, int family,
160 rpcprog_t pgm, rpcvers_t vers, struct cred *cred, CLIENT **cl)
161 {
162 CLIENT *h;
163 struct cku_private *p;
164 struct rpc_msg call_msg;
165 rdma_registry_t *rp;
166
167 ASSERT(INGLOBALZONE(curproc));
168
169 if (cl == NULL)
170 return (EINVAL);
171 *cl = NULL;
172
173 p = kmem_zalloc(sizeof (*p), KM_SLEEP);
174
175 /*
176 * Find underlying RDMATF plugin
177 */
178 rw_enter(&rdma_lock, RW_READER);
179 rp = rdma_mod_head;
180 while (rp != NULL) {
181 if (strcmp(rp->r_mod->rdma_api, proto))
182 rp = rp->r_next;
183 else {
184 p->cku_rd_mod = rp->r_mod;
185 p->cku_rd_handle = handle;
186 break;
187 }
188 }
189 rw_exit(&rdma_lock);
190
191 if (p->cku_rd_mod == NULL) {
192 /*
193 * Should not happen.
194 * No matching RDMATF plugin.
195 */
196 kmem_free(p, sizeof (struct cku_private));
197 return (EINVAL);
198 }
199
200 h = ptoh(p);
201 h->cl_ops = &rdma_clnt_ops;
202 h->cl_private = (caddr_t)p;
203 h->cl_auth = authkern_create();
204
205 /* call message, just used to pre-serialize below */
206 call_msg.rm_xid = 0;
207 call_msg.rm_direction = CALL;
208 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
209 call_msg.rm_call.cb_prog = pgm;
210 call_msg.rm_call.cb_vers = vers;
211
212 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE);
213 /* pre-serialize call message header */
214 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
215 XDR_DESTROY(&p->cku_outxdr);
216 auth_destroy(h->cl_auth);
217 kmem_free(p, sizeof (struct cku_private));
218 return (EINVAL);
219 }
220
221 /*
222 * Set up the rpc information
223 */
224 p->cku_cred = cred;
225 p->cku_addr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP);
226 p->cku_addr.maxlen = raddr->maxlen;
227 p->cku_addr.len = raddr->len;
228 bcopy(raddr->buf, p->cku_addr.buf, raddr->len);
229 p->cku_addrfmly = family;
230
231 *cl = h;
232 return (0);
233 }
234
235 static void
236 clnt_rdma_kdestroy(CLIENT *h)
237 {
238 struct cku_private *p = htop(h);
239
240 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
241 kmem_free(p, sizeof (*p));
242 }
243
244 void
245 clnt_rdma_kinit(CLIENT *h, char *proto, void *handle, struct netbuf *raddr,
246 struct cred *cred)
247 {
248 struct cku_private *p = htop(h);
249 rdma_registry_t *rp;
250
251 ASSERT(INGLOBALZONE(curproc));
252 /*
253 * Find underlying RDMATF plugin
254 */
255 p->cku_rd_mod = NULL;
256 rw_enter(&rdma_lock, RW_READER);
257 rp = rdma_mod_head;
258 while (rp != NULL) {
259 if (strcmp(rp->r_mod->rdma_api, proto))
260 rp = rp->r_next;
261 else {
262 p->cku_rd_mod = rp->r_mod;
263 p->cku_rd_handle = handle;
264 break;
265 }
266
267 }
268 rw_exit(&rdma_lock);
269
270 /*
271 * Set up the rpc information
272 */
273 p->cku_cred = cred;
274 p->cku_xid = 0;
275
276 if (p->cku_addr.maxlen < raddr->len) {
277 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
278 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
279 p->cku_addr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP);
280 p->cku_addr.maxlen = raddr->maxlen;
281 }
282
283 p->cku_addr.len = raddr->len;
284 bcopy(raddr->buf, p->cku_addr.buf, raddr->len);
285 h->cl_ops = &rdma_clnt_ops;
286 }
287
288 /* ARGSUSED */
289 static enum clnt_stat
290 clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
291 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
292 {
293 cku_private_t *p = htop(h);
294 int status;
295 XDR *xdrs;
296 XDR *cxdrp = NULL, callxdr; /* for xdrrdma encoding the RPC call */
297 XDR *rxdrp = NULL, replxdr; /* for xdrrdma decoding the RPC reply */
298 struct rpc_msg reply_msg;
299 struct clist *sendlist, *recvlist = NULL;
300 struct clist *cl = NULL, *cle = NULL;
301 uint_t vers, op;
302 uint_t off;
303 uint32_t xid;
304 CONN *conn = NULL;
305 rdma_buf_t clmsg, rpcmsg, longmsg, rpcreply;
306 int msglen;
307 clock_t ticks;
308
309 RCSTAT_INCR(rccalls);
310 /*
311 * Get unique xid
312 */
313 if (p->cku_xid == 0)
314 p->cku_xid = alloc_xid();
315
316 status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_addr,
317 p->cku_addrfmly, p->cku_rd_handle, &conn);
318
319 if (conn == NULL) {
320 /*
321 * Connect failed to server. Could be because of one
322 * of several things. In some cases we don't want
323 * the caller to retry immediately - delay before
324 * returning to caller.
325 */
326 switch (status) {
327 case RDMA_TIMEDOUT:
328 /*
329 * Already timed out. No need to delay
330 * some more.
331 */
332 p->cku_err.re_status = RPC_TIMEDOUT;
333 p->cku_err.re_errno = ETIMEDOUT;
334 break;
335 case RDMA_INTR:
336 /*
337 * Failed because of an signal. Very likely
338 * the caller will not retry.
339 */
340 p->cku_err.re_status = RPC_INTR;
341 p->cku_err.re_errno = EINTR;
342 break;
343 default:
344 /*
345 * All other failures - server down or service
346 * down or temporary resource failure. Delay before
347 * returning to caller.
348 */
349 ticks = clnt_rdma_min_delay * drv_usectohz(1000000);
350 p->cku_err.re_status = RPC_CANTCONNECT;
351 p->cku_err.re_errno = EIO;
352
353 if (h->cl_nosignal == TRUE) {
354 delay(ticks);
355 } else {
356 if (delay_sig(ticks) == EINTR) {
357 p->cku_err.re_status = RPC_INTR;
358 p->cku_err.re_errno = EINTR;
359 }
360 }
361 break;
362 }
363
364 return (p->cku_err.re_status);
365 }
366 /*
367 * Get the size of the rpc call message. Need this
368 * to determine if the rpc call message will fit in
369 * the pre-allocated RDMA buffers. If the rpc call
370 * message length is greater that the pre-allocated
371 * buffers then, it is a Long RPC. A one time use
372 * buffer is allocated and registered for the Long
373 * RPC call.
374 */
375 xdrs = &callxdr;
376 msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT;
377 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
378 msglen += xdrrdma_authsize(h->cl_auth, p->cku_cred,
379 rdma_minchunk);
380 msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk);
381
382 if (msglen > RPC_MSG_SZ) {
383
384 /*
385 * Long RPC. Allocate one time use custom buffer.
386 */
387 rpcmsg.type = CHUNK_BUFFER;
388 rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
389 cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
390 cle->c_xdroff = 0;
391 cle->c_len = rpcmsg.len = msglen;
392 cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr;
393 cle->c_next = NULL;
394 xdrrdma_create(xdrs, rpcmsg.addr, msglen,
395 rdma_minchunk, cle, XDR_ENCODE, NULL);
396 cxdrp = xdrs;
397 op = RDMA_NOMSG;
398 } else {
399 /*
400 * Get a pre-allocated buffer for rpc call
401 */
402 rpcmsg.type = SEND_BUFFER;
403 if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
404 p->cku_err.re_status = RPC_CANTSEND;
405 p->cku_err.re_errno = EIO;
406 RCSTAT_INCR(rcnomem);
407 cmn_err(CE_WARN,
408 "clnt_rdma_kcallit: no buffers!");
409 goto done;
410 }
411 xdrrdma_create(xdrs, rpcmsg.addr, rpcmsg.len,
412 rdma_minchunk, NULL, XDR_ENCODE, NULL);
413 cxdrp = xdrs;
414 op = RDMA_MSG;
415 }
416 } else {
417 /*
418 * For RPCSEC_GSS since we cannot accurately presize the
419 * buffer required for encoding, we assume that its going
420 * to be a Long RPC to start with. We also create the
421 * the XDR stream with min_chunk set to 0 which instructs
422 * the XDR layer to not chunk the incoming byte stream.
423 */
424
425 msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth);
426 msglen += xdr_sizeof(xdr_args, argsp);
427
428 /*
429 * Long RPC. Allocate one time use custom buffer.
430 */
431 longmsg.type = CHUNK_BUFFER;
432 longmsg.addr = kmem_zalloc(msglen, KM_SLEEP);
433 cle = kmem_zalloc(sizeof (*cle), KM_SLEEP);
434 cle->c_xdroff = 0;
435 cle->c_len = longmsg.len = msglen;
436 cle->c_saddr = (uint64)(uintptr_t)longmsg.addr;
437 cle->c_next = NULL;
438 xdrrdma_create(xdrs, longmsg.addr, msglen, 0, cle,
439 XDR_ENCODE, NULL);
440 cxdrp = xdrs;
441 op = RDMA_NOMSG;
442 }
443
444 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
445 /*
446 * Copy in the preserialized RPC header
447 * information.
448 */
449 bcopy(p->cku_rpchdr, rpcmsg.addr, CKU_HDRSIZE);
450
451 /*
452 * transaction id is the 1st thing in the output
453 * buffer.
454 */
455 /* LINTED pointer alignment */
456 (*(uint32_t *)(rpcmsg.addr)) = p->cku_xid;
457
458 /* Skip the preserialized stuff. */
459 XDR_SETPOS(xdrs, CKU_HDRSIZE);
460
461 /* Serialize dynamic stuff into the output buffer. */
462 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
463 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
464 (!(*xdr_args)(xdrs, argsp))) {
465 rdma_buf_free(conn, &rpcmsg);
466 if (cle)
467 clist_free(cle);
468 p->cku_err.re_status = RPC_CANTENCODEARGS;
469 p->cku_err.re_errno = EIO;
470 cmn_err(CE_WARN,
471 "clnt_rdma_kcallit: XDR_PUTINT32/AUTH_MARSHAL/xdr_args failed");
472 goto done;
473 }
474 p->cku_outsz = XDR_GETPOS(xdrs);
475 } else {
476 uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE];
477 IXDR_PUT_U_INT32(uproc, procnum);
478 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
479 XDR_SETPOS(xdrs, 0);
480
481 /* Serialize the procedure number and the arguments. */
482 if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr,
483 CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) {
484 if (longmsg.addr != xdrs->x_base) {
485 longmsg.addr = xdrs->x_base;
486 longmsg.len = xdr_getbufsize(xdrs);
487 }
488 rdma_buf_free(conn, &longmsg);
489 clist_free(cle);
490 p->cku_err.re_status = RPC_CANTENCODEARGS;
491 p->cku_err.re_errno = EIO;
492 cmn_err(CE_WARN,
493 "clnt_rdma_kcallit: AUTH_WRAP failed");
494 goto done;
495 }
496 /*
497 * If we had to allocate a new buffer while encoding
498 * then update the addr and len.
499 */
500 if (longmsg.addr != xdrs->x_base) {
501 longmsg.addr = xdrs->x_base;
502 longmsg.len = xdr_getbufsize(xdrs);
503 }
504
505 /*
506 * If it so happens that the encoded message is after all
507 * not long enough to be a Long RPC then allocate a
508 * SEND_BUFFER and copy the encoded message into it.
509 */
510 p->cku_outsz = XDR_GETPOS(xdrs);
511 if (p->cku_outsz > RPC_MSG_SZ) {
512 rpcmsg.type = CHUNK_BUFFER;
513 rpcmsg.addr = longmsg.addr;
514 rpcmsg.len = longmsg.len;
515 } else {
516 clist_free(cle);
517 XDR_DESTROY(cxdrp);
518 cxdrp = NULL;
519 /*
520 * Get a pre-allocated buffer for rpc call
521 */
522 rpcmsg.type = SEND_BUFFER;
523 if (RDMA_BUF_ALLOC(conn, &rpcmsg)) {
524 p->cku_err.re_status = RPC_CANTSEND;
525 p->cku_err.re_errno = EIO;
526 RCSTAT_INCR(rcnomem);
527 cmn_err(CE_WARN,
528 "clnt_rdma_kcallit: no buffers!");
529 rdma_buf_free(conn, &longmsg);
530 goto done;
531 }
532 bcopy(longmsg.addr, rpcmsg.addr, p->cku_outsz);
533 xdrrdma_create(xdrs, rpcmsg.addr, p->cku_outsz, 0,
534 NULL, XDR_ENCODE, NULL);
535 cxdrp = xdrs;
536 rdma_buf_free(conn, &longmsg);
537 op = RDMA_MSG;
538 }
539 }
540
541 cl = xdrrdma_clist(xdrs);
542
543 /*
544 * Update the chunk size information for the Long RPC msg.
545 */
546 if (cl && op == RDMA_NOMSG)
547 cl->c_len = p->cku_outsz;
548
549 /*
550 * Set up the RDMA chunk message
551 */
552 vers = RPCRDMA_VERS;
553 clmsg.type = SEND_BUFFER;
554 if (RDMA_BUF_ALLOC(conn, &clmsg)) {
555 p->cku_err.re_status = RPC_CANTSEND;
556 p->cku_err.re_errno = EIO;
557 rdma_buf_free(conn, &rpcmsg);
558 RCSTAT_INCR(rcnomem);
559 cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffers!!");
560 goto done;
561 }
562 xdrs = &p->cku_outxdr;
563 xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE);
564 /*
565 * Treat xid as opaque (xid is the first entity
566 * in the rpc rdma message).
567 */
568 (*(uint32_t *)clmsg.addr) = p->cku_xid;
569 /* Skip xid and set the xdr position accordingly. */
570 XDR_SETPOS(xdrs, sizeof (uint32_t));
571 (void) xdr_u_int(xdrs, &vers);
572 (void) xdr_u_int(xdrs, &op);
573
574 /*
575 * Now XDR the chunk list
576 */
577 if (cl != NULL) {
578
579 /*
580 * Register the chunks in the list
581 */
582 status = clist_register(conn, cl, 1);
583 if (status != RDMA_SUCCESS) {
584 cmn_err(CE_WARN,
585 "clnt_rdma_kcallit: clist register failed");
586 rdma_buf_free(conn, &clmsg);
587 rdma_buf_free(conn, &rpcmsg);
588 clist_free(cl);
589 p->cku_err.re_status = RPC_CANTSEND;
590 p->cku_err.re_errno = EIO;
591 goto done;
592 }
593
594 }
595 (void) xdr_do_clist(xdrs, &cl);
596
597 /*
598 * Start with the RDMA header and clist (if any)
599 */
600 sendlist = NULL;
601 clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle,
602 clmsg.addr, NULL, NULL);
603
604 /*
605 * Put the RPC call message in the send list if small RPC
606 */
607 if (op == RDMA_MSG) {
608 clist_add(&sendlist, 0, p->cku_outsz, &rpcmsg.handle,
609 rpcmsg.addr, NULL, NULL);
610 } else {
611 /* Long RPC already in chunk list */
612 RCSTAT_INCR(rclongrpcs);
613 }
614
615 /*
616 * Set up a reply buffer ready for the reply
617 */
618 status = rdma_clnt_postrecv(conn, p->cku_xid);
619 if (status != RDMA_SUCCESS) {
620 rdma_buf_free(conn, &clmsg);
621 rdma_buf_free(conn, &rpcmsg);
622 if (cl) {
623 (void) clist_deregister(conn, cl, 1);
624 clist_free(cl);
625 }
626 clist_free(sendlist);
627 p->cku_err.re_status = RPC_CANTSEND;
628 p->cku_err.re_errno = EIO;
629 goto done;
630 }
631 /*
632 * sync the memory for dma
633 */
634 if (cl != NULL) {
635 status = clist_syncmem(conn, cl, 1);
636 if (status != RDMA_SUCCESS) {
637 rdma_buf_free(conn, &clmsg);
638 rdma_buf_free(conn, &rpcmsg);
639 (void) clist_deregister(conn, cl, 1);
640 clist_free(cl);
641 clist_free(sendlist);
642 p->cku_err.re_status = RPC_CANTSEND;
643 p->cku_err.re_errno = EIO;
644 goto done;
645 }
646 }
647
648 /*
649 * Send the call message to the server
650 */
651 status = RDMA_SEND(conn, sendlist, p->cku_xid);
652 if (status != RDMA_SUCCESS) {
653 if (cl) {
654 (void) clist_deregister(conn, cl, 1);
655 clist_free(cl);
656 /*
657 * If this was a long RPC message, need
658 * to free that buffer.
659 */
660 if (rpcmsg.type == CHUNK_BUFFER)
661 rdma_buf_free(conn, &rpcmsg);
662 }
663 clist_free(sendlist);
664 p->cku_err.re_status = RPC_CANTSEND;
665 p->cku_err.re_errno = EIO;
666 goto done;
667 } else {
668 /*
669 * RDMA plugin now owns the send msg buffers.
670 * Clear them out and don't free them here.
671 */
672 clmsg.addr = NULL;
673 if (rpcmsg.type == SEND_BUFFER)
674 rpcmsg.addr = NULL;
675 }
676 clist_free(sendlist);
677 #ifdef DEBUG
678 if (rdma_clnt_debug) {
679 printf("clnt_rdma_kcallit: send request xid %u\n", p->cku_xid);
680 }
681 #endif
682
683 /*
684 * Recv rpc reply
685 */
686 status = RDMA_RECV(conn, &recvlist, p->cku_xid);
687
688 /*
689 * Deregister chunks sent. Do this only after the reply
690 * is received as that is a sure indication that the
691 * remote end has completed RDMA of the chunks.
692 */
693 if (cl != NULL) {
694 /*
695 * Deregister the chunks
696 */
697 (void) clist_deregister(conn, cl, 1);
698 clist_free(cl);
699 /*
700 * If long RPC free chunk
701 */
702 rdma_buf_free(conn, &rpcmsg);
703 }
704
705 /*
706 * Now check recv status
707 */
708 if (status != 0) {
709 #ifdef DEBUG
710 if (rdma_clnt_debug)
711 cmn_err(CE_NOTE,
712 "clnt_rdma_kcallit: reply failed %u status %d",
713 p->cku_xid, status);
714 #endif
715 if (status == RDMA_INTR) {
716 p->cku_err.re_status = RPC_INTR;
717 p->cku_err.re_errno = EINTR;
718 RCSTAT_INCR(rcintrs);
719 } else if (status == RPC_TIMEDOUT) {
720 p->cku_err.re_status = RPC_TIMEDOUT;
721 p->cku_err.re_errno = ETIMEDOUT;
722 RCSTAT_INCR(rctimeouts);
723 } else {
724 p->cku_err.re_status = RPC_CANTRECV;
725 p->cku_err.re_errno = EIO;
726 }
727 goto done;
728 }
729 #ifdef DEBUG
730 if (rdma_clnt_debug)
731 printf("clnt_rdma_kcallit: got response xid %u\n", p->cku_xid);
732 #endif
733 /*
734 * Process the reply message.
735 *
736 * First the chunk list (if any)
737 */
738 xdrs = &(p->cku_inxdr);
739 xdrmem_create(xdrs, (caddr_t)(uintptr_t)recvlist->c_saddr,
740 recvlist->c_len, XDR_DECODE);
741 /*
742 * Treat xid as opaque (xid is the first entity
743 * in the rpc rdma message).
744 */
745 xid = *(uint32_t *)(uintptr_t)recvlist->c_saddr;
746 /* Skip xid and set the xdr position accordingly. */
747 XDR_SETPOS(xdrs, sizeof (uint32_t));
748 (void) xdr_u_int(xdrs, &vers);
749 (void) xdr_u_int(xdrs, &op);
750 (void) xdr_do_clist(xdrs, &cl);
751 off = xdr_getpos(xdrs);
752
753 /*
754 * Now the RPC reply message itself. If the reply
755 * came as a chunk item, then RDMA the reply over.
756 */
757 xdrs = &replxdr;
758 if (cl && op == RDMA_NOMSG) {
759 struct clist *cle = cl;
760
761 rpcreply.type = CHUNK_BUFFER;
762 rpcreply.addr = kmem_alloc(cle->c_len, KM_SLEEP);
763 rpcreply.len = cle->c_len;
764 cle->c_daddr = (uint64)(uintptr_t)rpcreply.addr;
765 cl = cl->c_next;
766 cle->c_next = NULL;
767
768 /*
769 * Register the rpc reply chunk destination
770 */
771 status = clist_register(conn, cle, 0);
772 if (status) {
773 rdma_buf_free(conn, &rpcreply);
774 clist_free(cle);
775 p->cku_err.re_status = RPC_CANTDECODERES;
776 p->cku_err.re_errno = EIO;
777 cmn_err(CE_WARN,
778 "clnt_rdma_kcallit: clist_register failed");
779 goto rdma_done;
780 }
781
782 /*
783 * Now read rpc reply in
784 */
785 #ifdef DEBUG
786 if (rdma_clnt_debug)
787 printf("clnt_rdma_kcallit: read chunk, len %d, xid %u, \
788 reply xid %u\n", cle->c_len, p->cku_xid, xid);
789 #endif
790 status = RDMA_READ(conn, cle, WAIT);
791 if (status) {
792 (void) clist_deregister(conn, cle, 0);
793 rdma_buf_free(conn, &rpcreply);
794 clist_free(cle);
795 p->cku_err.re_status = RPC_CANTDECODERES;
796 p->cku_err.re_errno = EIO;
797 cmn_err(CE_WARN,
798 "clnt_rdma_kcallit: RDMA_READ failed");
799 goto rdma_done;
800 }
801
802 /*
803 * sync the memory for dma
804 */
805 status = clist_syncmem(conn, cle, 0);
806 if (status != RDMA_SUCCESS) {
807 (void) clist_deregister(conn, cle, 0);
808 rdma_buf_free(conn, &rpcreply);
809 clist_free(cle);
810 p->cku_err.re_status = RPC_CANTDECODERES;
811 p->cku_err.re_errno = EIO;
812 goto rdma_done;
813 }
814
815 /*
816 * Deregister the Long RPC chunk
817 */
818 (void) clist_deregister(conn, cle, 0);
819 clist_free(cle);
820 xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len, 0, cl,
821 XDR_DECODE, conn);
822 rxdrp = xdrs;
823 } else {
824 rpcreply.addr = NULL;
825 xdrrdma_create(xdrs,
826 (caddr_t)(uintptr_t)(recvlist->c_saddr + off),
827 recvlist->c_len - off, 0, cl, XDR_DECODE, conn);
828 rxdrp = xdrs;
829 }
830
831 reply_msg.rm_direction = REPLY;
832 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
833 reply_msg.acpted_rply.ar_stat = SUCCESS;
834 reply_msg.acpted_rply.ar_verf = _null_auth;
835 /*
836 * xdr_results will be done in AUTH_UNWRAP.
837 */
838 reply_msg.acpted_rply.ar_results.where = NULL;
839 reply_msg.acpted_rply.ar_results.proc = xdr_void;
840
841 /*
842 * Decode and validate the response.
843 */
844 if (xdr_replymsg(xdrs, &reply_msg)) {
845 enum clnt_stat re_status;
846
847 _seterr_reply(&reply_msg, &(p->cku_err));
848
849 re_status = p->cku_err.re_status;
850 if (re_status == RPC_SUCCESS) {
851 /*
852 * Reply is good, check auth.
853 */
854 if (!AUTH_VALIDATE(h->cl_auth,
855 &reply_msg.acpted_rply.ar_verf)) {
856 p->cku_err.re_status = RPC_AUTHERROR;
857 p->cku_err.re_why = AUTH_INVALIDRESP;
858 RCSTAT_INCR(rcbadverfs);
859 cmn_err(CE_WARN,
860 "clnt_rdma_kcallit: AUTH_VALIDATE failed");
861 } else if (!AUTH_UNWRAP(h->cl_auth, xdrs,
862 xdr_results, resultsp)) {
863 p->cku_err.re_status = RPC_CANTDECODERES;
864 p->cku_err.re_errno = EIO;
865 cmn_err(CE_WARN,
866 "clnt_rdma_kcallit: AUTH_UNWRAP failed");
867 }
868 } else {
869 /* set errno in case we can't recover */
870 if (re_status != RPC_VERSMISMATCH &&
871 re_status != RPC_AUTHERROR &&
872 re_status != RPC_PROGVERSMISMATCH)
873 p->cku_err.re_errno = EIO;
874
875 if (re_status == RPC_AUTHERROR) {
876 /*
877 * Map recoverable and unrecoverable
878 * authentication errors to appropriate
879 * errno
880 */
881 switch (p->cku_err.re_why) {
882 case AUTH_BADCRED:
883 case AUTH_BADVERF:
884 case AUTH_INVALIDRESP:
885 case AUTH_TOOWEAK:
886 case AUTH_FAILED:
887 case RPCSEC_GSS_NOCRED:
888 case RPCSEC_GSS_FAILED:
889 p->cku_err.re_errno = EACCES;
890 break;
891 case AUTH_REJECTEDCRED:
892 case AUTH_REJECTEDVERF:
893 default:
894 p->cku_err.re_errno = EIO;
895 break;
896 }
897 RPCLOG(1, "clnt_rdma_kcallit : "
898 "authentication failed with "
899 "RPC_AUTHERROR of type %d\n",
900 p->cku_err.re_why);
901 }
902 cmn_err(CE_WARN,
903 "clnt_rdma_kcallit: RPC failed");
904
905 }
906 } else {
907 p->cku_err.re_status = RPC_CANTDECODERES;
908 p->cku_err.re_errno = EIO;
909 cmn_err(CE_WARN, "clnt_rdma_kcallit: xdr_replymsg failed");
910 }
911
912 /*
913 * If rpc reply is in a chunk, free it now.
914 */
915 if (rpcreply.addr != NULL)
916 rdma_buf_free(conn, &rpcreply);
917
918 rdma_done:
919 if ((cl != NULL) || (op == RDMA_NOMSG)) {
920 rdma_buf_t donemsg;
921
922 /*
923 * Free the list holding the chunk info
924 */
925 if (cl) {
926 clist_free(cl);
927 cl = NULL;
928 }
929
930 /*
931 * Tell the server that the reads are done
932 */
933 donemsg.type = SEND_BUFFER;
934 if (RDMA_BUF_ALLOC(conn, &donemsg)) {
935 p->cku_err.re_status = RPC_CANTSEND;
936 p->cku_err.re_errno = EIO;
937 RCSTAT_INCR(rcnomem);
938 cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffer");
939 goto done;
940 }
941 xdrs = &p->cku_outxdr;
942 xdrmem_create(xdrs, donemsg.addr, donemsg.len, XDR_ENCODE);
943 vers = RPCRDMA_VERS;
944 op = RDMA_DONE;
945
946 /*
947 * Treat xid as opaque (xid is the first entity
948 * in the rpc rdma message).
949 */
950 (*(uint32_t *)donemsg.addr) = p->cku_xid;
951 /* Skip xid and set the xdr position accordingly. */
952 XDR_SETPOS(xdrs, sizeof (uint32_t));
953 if (!xdr_u_int(xdrs, &vers) ||
954 !xdr_u_int(xdrs, &op)) {
955 cmn_err(CE_WARN,
956 "clnt_rdma_kcallit: xdr_u_int failed");
957 rdma_buf_free(conn, &donemsg);
958 goto done;
959 }
960
961 sendlist = NULL;
962 clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &donemsg.handle,
963 donemsg.addr, NULL, NULL);
964
965 status = RDMA_SEND(conn, sendlist, p->cku_xid);
966 if (status != RDMA_SUCCESS) {
967 cmn_err(CE_WARN,
968 "clnt_rdma_kcallit: RDMA_SEND failed xid %u",
969 p->cku_xid);
970 }
971 #ifdef DEBUG
972 else {
973 if (rdma_clnt_debug)
974 printf("clnt_rdma_kcallit: sent RDMA_DONE xid %u\n",
975 p->cku_xid);
976 }
977 #endif
978 clist_free(sendlist);
979 }
980
981 done:
982 if (cxdrp)
983 XDR_DESTROY(cxdrp);
984 if (rxdrp) {
985 (void) xdr_rpc_free_verifier(rxdrp, &reply_msg);
986 XDR_DESTROY(rxdrp);
987 }
988
989 if (recvlist) {
990 rdma_buf_t recvmsg;
991
992 recvmsg.addr = (caddr_t)(uintptr_t)recvlist->c_saddr;
993 recvmsg.type = RECV_BUFFER;
994 RDMA_BUF_FREE(conn, &recvmsg);
995 clist_free(recvlist);
996 }
997 RDMA_REL_CONN(conn);
998 if (p->cku_err.re_status != RPC_SUCCESS) {
999 RCSTAT_INCR(rcbadcalls);
1000 }
1001 return (p->cku_err.re_status);
1002 }
1003
1004 /* ARGSUSED */
1005 static void
1006 clnt_rdma_kabort(CLIENT *h)
1007 {
1008 }
1009
1010 static void
1011 clnt_rdma_kerror(CLIENT *h, struct rpc_err *err)
1012 {
1013 struct cku_private *p = htop(h);
1014
1015 *err = p->cku_err;
1016 }
1017
1018 static bool_t
1019 clnt_rdma_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
1020 {
1021 struct cku_private *p = htop(h);
1022 XDR *xdrs;
1023
1024 xdrs = &(p->cku_outxdr);
1025 xdrs->x_op = XDR_FREE;
1026 return ((*xdr_res)(xdrs, res_ptr));
1027 }
1028
1029 /* ARGSUSED */
1030 static bool_t
1031 clnt_rdma_kcontrol(CLIENT *h, int cmd, char *arg)
1032 {
1033 return (TRUE);
1034 }
1035
1036 /* ARGSUSED */
1037 static int
1038 clnt_rdma_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
1039 int minimum, void(*feedback)(int, int, caddr_t), caddr_t arg,
1040 uint32_t xid)
1041 {
1042 RCSTAT_INCR(rctimers);
1043 return (0);
1044 }
1045
1046 int
1047 rdma_reachable(int addr_type, struct netbuf *addr, struct knetconfig **knconf)
1048 {
1049 rdma_registry_t *rp;
1050 void *handle = NULL;
1051 struct knetconfig *knc;
1052 char *pf, *p;
1053 rdma_stat status;
1054 int error = 0;
1055
1056 if (!INGLOBALZONE(curproc))
1057 return (-1);
1058 /*
1059 * modload the RDMA plugins if not already done.
1060 */
1061 if (!rdma_modloaded) {
1062 mutex_enter(&rdma_modload_lock);
1063 if (!rdma_modloaded) {
1064 error = rdma_modload();
1065 }
1066 mutex_exit(&rdma_modload_lock);
1067 if (error)
1068 return (-1);
1069 }
1070
1071 if (!rdma_dev_available)
1072 return (-1);
1073
1074 rw_enter(&rdma_lock, RW_READER);
1075 rp = rdma_mod_head;
1076 while (rp != NULL) {
1077 status = RDMA_REACHABLE(rp->r_mod->rdma_ops, addr_type, addr,
1078 &handle);
1079 if (status == RDMA_SUCCESS) {
1080 knc = kmem_zalloc(sizeof (struct knetconfig),
1081 KM_SLEEP);
1082 knc->knc_semantics = NC_TPI_RDMA;
1083 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1084 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
1085 if (addr_type == AF_INET)
1086 (void) strncpy(pf, NC_INET, KNC_STRSIZE);
1087 else if (addr_type == AF_INET6)
1088 (void) strncpy(pf, NC_INET6, KNC_STRSIZE);
1089 pf[KNC_STRSIZE - 1] = '\0';
1090
1091 (void) strncpy(p, rp->r_mod->rdma_api, KNC_STRSIZE);
1092 p[KNC_STRSIZE - 1] = '\0';
1093
1094 knc->knc_protofmly = pf;
1095 knc->knc_proto = p;
1096 knc->knc_rdev = (dev_t)handle;
1097 *knconf = knc;
1098 rw_exit(&rdma_lock);
1099 return (0);
1100 }
1101 rp = rp->r_next;
1102 }
1103 rw_exit(&rdma_lock);
1104 return (-1);
1105 }