New svc_rdma.c
  1 /*
  2  * CDDL HEADER START
  3  *
  4  * The contents of this file are subject to the terms of the
  5  * Common Development and Distribution License, Version 1.0 only
  6  * (the "License").  You may not use this file except in compliance
  7  * with the License.
  8  *
  9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 10  * or http://www.opensolaris.org/os/licensing.
 11  * See the License for the specific language governing permissions
 12  * and limitations under the License.
 13  *
 14  * When distributing Covered Code, include this CDDL HEADER in each
 15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 16  * If applicable, add the following below this CDDL HEADER, with the
 17  * fields enclosed by brackets "[]" replaced with your own identifying
 18  * information: Portions Copyright [yyyy] [name of copyright owner]
 19  *
 20  * CDDL HEADER END
 21  */
 22 /*
 23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 24  * Use is subject to license terms.
 25  */
 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
 27 /* All Rights Reserved */
 28 /*
 29  * Portions of this source code were derived from Berkeley
 30  * 4.3 BSD under license from the Regents of the University of
 31  * California.
 32  */
 33 
 34  /* Copyright (c) 2006, The Ohio State University. All rights reserved.
 35   *
 36   * Portions of this source code were developed by the team members of
 37   * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 38   * headed by Professor Dhabaleswar K. (DK) Panda.
 39   *
 40   * Acknowledgements to contributions from developers:
 41   *   Ranjit Noronha: noronha@cse.ohio-state.edu
 42   *   Lei Chai      : chail@cse.ohio-state.edu
 43   *   Weikuan Yu    : yuw@cse.ohio-state.edu
 44   *
 45   */
 46 
 47 #pragma ident   "@(#)svc_rdma.c 1.8     05/06/10 SMI"
 48 
 49 /*
 50  * Server side of RPC over RDMA in the kernel.
 51  */
 52 
 53 #include <sys/param.h>
 54 #include <sys/types.h>
 55 #include <sys/user.h>
 56 #include <sys/sysmacros.h>
 57 #include <sys/proc.h>
 58 #include <sys/file.h>
 59 #include <sys/errno.h>
 60 #include <sys/kmem.h>
 61 #include <sys/debug.h>
 62 #include <sys/systm.h>
 63 #include <sys/cmn_err.h>
 64 #include <sys/kstat.h>
 65 #include <sys/vtrace.h>
 66 #include <sys/debug.h>
 67 
 68 #include <rpc/types.h>
 69 #include <rpc/xdr.h>
 70 #include <rpc/auth.h>
 71 #include <rpc/clnt.h>
 72 #include <rpc/rpc_msg.h>
 73 #include <rpc/svc.h>
 74 #include <rpc/rpc_rdma.h>
 75 #include <sys/ddi.h>
 76 #include <sys/sunddi.h>
 77 
 78 #include <inet/common.h>
 79 #include <inet/ip.h>
 80 #include <inet/ip6.h>
 81 
 82 #include <nfs/nfs.h> 
 83 
 84 #define SVC_RDMA_SUCCESS 0      /* success status of the svc_process_*() helpers */
 85 #define SVC_RDMA_FAIL -1        /* failure status of the svc_process_*() helpers */
 86 
 87 #define SVC_CREDIT_FACTOR (0.5) /* share of an above-average credit request that is granted */
 88 
 89 uint32_t rdma_bufs_granted = RDMA_BUFS_GRANT;   /* initialised from RDMA_BUFS_GRANT */
 90 extern xdrproc_t x_READ3res;    /* compared against xdr_results in svc_process_wlist() */
 91 
 92 /*
 93  * RDMA transport specific data associated with SVCMASTERXPRT; one is
 94  * allocated per plugin by svc_rdma_kcreate() and stored in xp_p2. */
 95 struct rdma_data {
 96         SVCMASTERXPRT   *rd_xprt;       /* back ptr to owning SVCMASTERXPRT */
 97         struct rdma_svc_data rd_data;   /* listener state passed to the plugin's listen/stop ops */
 98         rdma_mod_t      *r_mod;         /* RDMA module containing ops ptr */
 99 };
100 
101 /*
102  * Plugin connection specific data stashed away in clone SVCXPRT
103  * (read from xp_p2buf and filled in by svc_rdma_krecv()). */
104 struct clone_rdma_data {
105         CONN            *conn;          /* RDMA connection the request arrived on */
106         rdma_buf_t      rpcbuf;         /* RPC req/resp buffer */
107         struct clist    *reply_cl;      /* reply chunk decoded from the request */
108         struct clist    *wlist;         /* write list decoded from the request */
109 };
110 
111 #ifdef DEBUG
112 int rdma_svc_debug = 0;         /* debug variable, only present in DEBUG builds; starts at 0 */
113 #endif
114 
115 #define MAXADDRLEN      128     /* max length for address mask */
116 
117 /*
118  * Routines exported through ops vector.
119  * (svc_rdma_kdestroy() and svc_rdma_kstop() are declared without static.) */
120 static bool_t           svc_rdma_krecv(SVCXPRT *, mblk_t *, struct rpc_msg *);
121 static bool_t           svc_rdma_ksend(SVCXPRT *, struct rpc_msg *);
122 static bool_t           svc_rdma_kgetargs(SVCXPRT *, xdrproc_t, caddr_t);
123 static bool_t           svc_rdma_kfreeargs(SVCXPRT *, xdrproc_t, caddr_t);
124 void                    svc_rdma_kdestroy(SVCMASTERXPRT *);
125 static int              svc_rdma_kdup(struct svc_req *, caddr_t, int,
126                                 struct dupreq **, bool_t *);
127 static void             svc_rdma_kdupdone(struct dupreq *, caddr_t,
128                                 void (*)(), int, int);
129 static int32_t          *svc_rdma_kgetres(SVCXPRT *, int);
130 static void             svc_rdma_kfreeres(SVCXPRT *);
131 static void             svc_rdma_kclone_destroy(SVCXPRT *);
132 static void             svc_rdma_kstart(SVCMASTERXPRT *);
133 void                    svc_rdma_kstop(SVCMASTERXPRT *);
134 /* Helpers local to this file. */
135 static int              svc_process_wlist(struct clone_rdma_data *, xdrproc_t,
136                                           caddr_t, int *, unsigned int *);
137 
138 static int              svc_process_long_reply(SVCXPRT *, CONN *, xdrproc_t, 
139                                                caddr_t,  caddr_t vd, XDR **, 
140                                                struct rpc_msg *, bool_t, int *, 
141                                                int *, int *, unsigned int *);
142 
143 static int              svc_compose_rpcmsg(SVCXPRT *, CONN *, xdrproc_t, caddr_t,
144                                            rdma_buf_t *, XDR **, struct rpc_msg *, 
145                                            bool_t, int *, unsigned int *);
146 #ifdef DYNAMIC_CREDIT_CONTROL /* per-connection credit accounting helpers */
147 static void             svc_consume_credit(CONN *);     /* decrements srv_cc_posted */
148 static void             svc_compute_credit(CONN *, uint32_t, int, int, int *); /* how many buffers to post */
149 static void             svc_update_credit(CONN * , int);        /* records newly posted buffers */
150 static void             svc_grant_credit(CONN *, uint32_t *);   /* reports srv_cc_buffers_granted */
151 #endif
152 
153 /*
154  * Server transport operations vector.
155  * Entries are positional; svc_rdma_kcreate() installs it in xp_ops. */
156 struct svc_ops rdma_svc_ops = {
157         svc_rdma_krecv,         /* Get requests */
158         svc_rdma_kgetargs,      /* Deserialize arguments */
159         svc_rdma_ksend,         /* Send reply */
160         svc_rdma_kfreeargs,     /* Free argument data space */
161         svc_rdma_kdestroy,      /* Destroy transport handle */
162         svc_rdma_kdup,          /* Check entry in dup req cache */
163         svc_rdma_kdupdone,      /* Mark entry in dup req cache as done */
164         svc_rdma_kgetres,       /* Get pointer to response buffer */
165         svc_rdma_kfreeres,      /* Destroy pre-serialized response header */
166         svc_rdma_kclone_destroy,        /* Destroy a clone xprt */
167         svc_rdma_kstart,                /* Tell `ready-to-receive' to rpcmod */
168         rdma_get_wchunk_seg             /* NOTE(review): defined outside this file; presumably write-chunk segment lookup -- confirm */
169 };
170 
171 /*
172  * Server statistics
173  * NOTE: This structure type is duplicated in the NFS fast path.
174  */
175 struct {
176         kstat_named_t   rscalls;        /* bumped on entry to svc_rdma_krecv() */
177         kstat_named_t   rsbadcalls;     /* bumped on every svc_rdma_krecv() failure */
178         kstat_named_t   rsnullrecv;
179         kstat_named_t   rsbadlen;
180         kstat_named_t   rsxdrcall;      /* bumped when xdr_callmsg() fails */
181         kstat_named_t   rsdupchecks;
182         kstat_named_t   rsdupreqs;
183         kstat_named_t   rslongrpcs;
184 } rdmarsstat = {
185         { "calls",      KSTAT_DATA_UINT64 },
186         { "badcalls",   KSTAT_DATA_UINT64 },
187         { "nullrecv",   KSTAT_DATA_UINT64 },
188         { "badlen",     KSTAT_DATA_UINT64 },
189         { "xdrcall",    KSTAT_DATA_UINT64 },
190         { "dupchecks",  KSTAT_DATA_UINT64 },
191         { "dupreqs",    KSTAT_DATA_UINT64 },
192         { "longrpcs",   KSTAT_DATA_UINT64 }
193 };
194 
195 kstat_named_t *rdmarsstat_ptr = (kstat_named_t *)&rdmarsstat;   /* rdmarsstat viewed as an array of kstat_named_t */
196 uint_t rdmarsstat_ndata = sizeof (rdmarsstat) / sizeof (kstat_named_t);  /* number of entries in that array */
197 
198 #define RSSTAT_INCR(x)  rdmarsstat.x.value.ui64++       /* plain ++ on one counter; the macro itself takes no lock */
199 
200 /*
201  * Create a transport record.
202  * The transport record, output buffer, and private data structure
203  * are allocated.  The output buffer is serialized into using xdrmem.
204  * There is one transport record per user process which implements a
205  * set of services.
206  */
207 /* ARGSUSED */
208 int
209 svc_rdma_kcreate(char *netid, SVC_CALLOUT_TABLE *sct, int id,
210         rdma_xprt_group_t *started_xprts)
211 {
212         int error;
213         SVCMASTERXPRT *xprt;
214         struct rdma_data *rd;
215         rdma_registry_t *rmod;
216         rdma_xprt_record_t *xprt_rec;
217         queue_t *q;
218         
219        mutex_enter(&rdma_modload_lock);
220        error = rdma_modload();
221        mutex_exit(&rdma_modload_lock);
222 
223         /*
224          * modload the RDMA plugins is not already done.
225          */
226         if (!rdma_modloaded) {
227                 mutex_enter(&rdma_modload_lock);
228                 if (!rdma_modloaded) {
229                         error = rdma_modload();
230                 }
231                 mutex_exit(&rdma_modload_lock);
232 
233                 if (error)
234                         return (error);
235         }
236 
237         /*
238          * master_xprt_count is the count of master transport handles
239          * that were successfully created and are ready to recieve for
240          * RDMA based access.
241          */
242         error = 0;
243         xprt_rec = NULL;
244         rw_enter(&rdma_lock, RW_READER);
245         if (rdma_mod_head == NULL) {
246                 started_xprts->rtg_count = 0;
247                 rw_exit(&rdma_lock);
248                 if (rdma_dev_available)
249                         return (EPROTONOSUPPORT);
250                 else
251                         return (ENODEV);
252         }
253 
254         /*
255          * If we have reached here, then atleast one RDMA plugin has loaded.
256          * Create a master_xprt, make it start listenining on the device,
257          * if an error is generated, record it, we might need to shut
258          * the master_xprt.
259          * SVC_START() calls svc_rdma_kstart which calls plugin binding
260          * routines.
261          */
262         for (rmod = rdma_mod_head; rmod != NULL; rmod = rmod->r_next) {
263 
264                 /*
265                  * One SVCMASTERXPRT per RDMA plugin.
266                  */
267                 xprt = kmem_zalloc(sizeof (*xprt), KM_SLEEP);
268                 xprt->xp_ops = &rdma_svc_ops;
269                 xprt->xp_sct = sct;
270                 xprt->xp_type = T_RDMA;
271                 mutex_init(&xprt->xp_req_lock, NULL, MUTEX_DEFAULT, NULL);
272                 mutex_init(&xprt->xp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
273                 xprt->xp_req_head = (mblk_t *)0;
274                 xprt->xp_req_tail = (mblk_t *)0;
275                 xprt->xp_threads = 0;
276                 xprt->xp_detached_threads = 0;
277 
278                 rd = kmem_zalloc(sizeof (*rd), KM_SLEEP);
279                 xprt->xp_p2 = (caddr_t)rd;
280                 rd->rd_xprt = xprt;
281                 rd->r_mod = rmod->r_mod;
282 
283                 q = &rd->rd_data.q;
284                 xprt->xp_wq = q;
285                 q->q_ptr = &rd->rd_xprt;
286                 xprt->xp_netid = NULL;
287 
288                 if (netid != NULL) {
289                         xprt->xp_netid = kmem_alloc(strlen(netid) + 1,
290                                                 KM_SLEEP);
291                         (void) strcpy(xprt->xp_netid, netid);
292                 }
293 
294                 xprt->xp_addrmask.maxlen =
295                     xprt->xp_addrmask.len = sizeof (struct sockaddr_in);
296                 xprt->xp_addrmask.buf =
297                     kmem_zalloc(xprt->xp_addrmask.len, KM_SLEEP);
298                 ((struct sockaddr_in *)xprt->xp_addrmask.buf)->sin_addr.s_addr =
299                     (uint32_t)~0;
300                 ((struct sockaddr_in *)xprt->xp_addrmask.buf)->sin_family =
301                     (ushort_t)~0;
302 
303                 /*
304                  * Each of the plugins will have their own Service ID
305                  * to listener specific mapping, like port number for VI
306                  * and service name for IB.
307                  */
308                 rd->rd_data.svcid = id;
309                 error = svc_xprt_register(xprt, id);
310                 if (error) {
311                         cmn_err(CE_WARN, "svc_rdma_kcreate: svc_xprt_register"
312                                 "failed");
313                         goto cleanup;
314                 }
315 
316                 SVC_START(xprt);
317                 if (!rd->rd_data.active) {
318                         svc_xprt_unregister(xprt);
319                         error = rd->rd_data.err_code;
320                         goto cleanup;
321                 }
322 
323                 /*
324                  * This is set only when there is atleast one or more
325                  * transports successfully created. We insert the pointer
326                  * to the created RDMA master xprt into a separately maintained
327                  * list. This way we can easily reference it later to cleanup,
328                  * when NFS kRPC service pool is going away/unregistered.
329                  */
330                 started_xprts->rtg_count ++;
331                 xprt_rec = kmem_alloc(sizeof (*xprt_rec), KM_SLEEP);
332                 xprt_rec->rtr_xprt_ptr = xprt;
333                 xprt_rec->rtr_next = started_xprts->rtg_listhead;
334                 started_xprts->rtg_listhead = xprt_rec;
335                 continue;
336 cleanup:
337                 SVC_DESTROY(xprt);
338                 if (error == RDMA_FAILED)
339                         error = EPROTONOSUPPORT;
340         }
341 
342         rw_exit(&rdma_lock);
343 
344         /*
345          * Don't return any error even if a single plugin was started
346          * successfully.
347          */
348         if (started_xprts->rtg_count == 0)
349                 return (error);
350         return (0);
351 }
352 
353 /*
354  * Cleanup routine for freeing up memory allocated by
355  * svc_rdma_kcreate()
356  */
357 void
358 svc_rdma_kdestroy(SVCMASTERXPRT *xprt)
359 {
360         struct rdma_data *rd = (struct rdma_data *)xprt->xp_p2;
361 
362 
363         mutex_destroy(&xprt->xp_req_lock);
364         mutex_destroy(&xprt->xp_thread_lock);
365         kmem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1);
366         kmem_free(rd, sizeof (*rd));
367         kmem_free(xprt->xp_addrmask.buf, xprt->xp_addrmask.maxlen);
368         kmem_free(xprt, sizeof (*xprt));
369 }
370 
371 
372 static void
373 svc_rdma_kstart(SVCMASTERXPRT *xprt)
374 {
375         struct rdma_svc_data *svcdata;
376         rdma_mod_t *rmod;
377 
378         svcdata = &((struct rdma_data *)xprt->xp_p2)->rd_data;
379         rmod = ((struct rdma_data *)xprt->xp_p2)->r_mod;
380 
381         /*
382          * Create a listener for  module at this port
383          */
384 
385         (*rmod->rdma_ops->rdma_svc_listen)(svcdata);
386 }
387 
388 void
389 svc_rdma_kstop(SVCMASTERXPRT *xprt)
390 {
391         struct rdma_svc_data *svcdata;
392         rdma_mod_t *rmod;
393 
394         svcdata = &((struct rdma_data *)xprt->xp_p2)->rd_data;
395         rmod = ((struct rdma_data *)xprt->xp_p2)->r_mod;
396 
397         /*
398          * Call the stop listener routine for each plugin.
399          */
400         (*rmod->rdma_ops->rdma_svc_stop)(svcdata);
401         if (svcdata->active)
402                 cmn_err(CE_WARN, "rdma_stop: Failed to shutdown RDMA based kRPC"
403                         "  listener");
404 }
405 
406 /* ARGSUSED */
407 static void
408 svc_rdma_kclone_destroy(SVCXPRT *clone_xprt)
409 {
410 }
411 
412 static bool_t
413 svc_rdma_krecv(SVCXPRT *clone_xprt, mblk_t *mp, struct rpc_msg *msg)
414 {
415         XDR  *xdrs;
416         CONN *conn;
417 
418         struct recv_data       *rdp = (struct recv_data *)mp->b_rptr;
419         struct clone_rdma_data *vd;
420         struct clist           *cl = NULL;
421         struct clist           *wcl = NULL;
422         struct clist           *repcl = NULL;
423         struct clist           *cllong = NULL;
424 
425         rdma_stat               status;
426         rdma_srv_cred_ctrl_t   *cc_info;
427 
428         uint32_t vers, op, pos, xid;
429         uint32_t rdma_credit;
430         uint32_t wcl_total_length = 0;
431         bool_t   wwl= FALSE;
432         int i, numclnts, availbufs, to_be_posted;
433 #ifdef SERVER_REG_CACHE
434         rib_lrc_entry_t *long_reply_buf = NULL;
435 #endif
436         vd = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
437         RSSTAT_INCR(rscalls);
438         conn = rdp->conn;
439 
440 #ifdef DYNAMIC_CREDIT_CONTROL
441         RDMA_GET_RESOURCE_INFO(conn, &numclnts, &availbufs);
442         svc_consume_credit(conn);
443 #else
444         status = rdma_svc_postrecv(conn);
445         if (status != RDMA_SUCCESS) {
446                 cmn_err(CE_NOTE,
447                         "svc_rdma_krecv: rdma_svc_postrecv failed %d", status);
448                 goto badrpc_call;
449         }
450 #endif
451 
452         xdrs = &clone_xprt->xp_xdrin;
453         xdrmem_create(xdrs, rdp->rpcmsg.addr, rdp->rpcmsg.len, XDR_DECODE);
454         xid = *(uint32_t *)rdp->rpcmsg.addr;
455         XDR_SETPOS(xdrs, sizeof (uint32_t));
456 
457         if (! xdr_u_int(xdrs, &vers) ||
458             ! xdr_u_int(xdrs, &rdma_credit) ||
459             ! xdr_u_int(xdrs, &op)) {
460                 cmn_err(CE_WARN, "svc_rdma_krecv: xdr_u_int failed");
461                 goto xdr_err;
462         }
463 
464 #ifdef DYNAMIC_CREDIT_CONTROL
465         svc_compute_credit(conn, rdma_credit, numclnts, availbufs, &to_be_posted);
466         for(i=0; i<to_be_posted; i++){
467                 status = rdma_svc_postrecv(conn);
468                 if (status != RDMA_SUCCESS) {
469                         cmn_err(CE_NOTE,
470                                 "svc_rdma_krecv: rdma_svc_postrecv failed %d", status);
471                         goto badrpc_call;
472                 }
473         }
474         svc_update_credit(conn, to_be_posted);         
475 #endif
476 
477         if (rdp->status != 0) {
478                 cmn_err(CE_NOTE,
479                         "svc_rdma_krecv: invalid status %d",
480                         rdp->status);
481                 goto badrpc_call;
482         }
483 
484         if (! xdr_do_clist(xdrs, &cl)) {
485                 cmn_err(CE_WARN, "svc_rdma_krecv: xdr_do_clist failed");
486                 goto xdr_err;
487         }
488 
489         if (!xdr_decode_wlist_new(xdrs, &wcl, &wwl, &wcl_total_length,conn)) {
490                 cmn_err(CE_NOTE, "svc recv: xdr_decode_wlist failed");
491                 if (cl)
492                         clist_free(cl);
493                 goto xdr_err;
494         }
495         vd->wlist = wcl;
496 
497         (void) xdr_decode_reply_wchunk(xdrs, &repcl, conn);
498         vd->reply_cl = repcl;
499 
500         /*
501          * A chunk at 0 offset indicates that the RPC call message
502          * is in a chunk. Get the RPC call message chunk.
503          */
504         if (cl != NULL && op == RDMA_NOMSG) {
505 
506                 /* Remove RPC call message chunk from chunklist */
507                 cllong = cl;
508                 cl = cl->c_next;
509                 cllong->c_next = NULL;
510 
511                 /* Allocate and register memory for the RPC call msg chunk */
512 #ifdef SERVER_REG_CACHE
513                 long_reply_buf         = RDMA_GET_SERVER_CACHE_BUF(conn,cllong->c_len);
514                 cllong->long_reply_buf = (uint64)long_reply_buf;
515                 cllong->c_daddr        = (uint64)(uintptr_t) long_reply_buf->lrc_buf;
516 #else
517                 cllong->c_daddr = (uint64)(uintptr_t)
518                             kmem_alloc(cllong->c_len, KM_SLEEP);
519 #endif
520                 if (cllong->c_daddr == NULL) {
521                         cmn_err(CE_WARN, "svc krecv: no memory for rpc call");
522                         clist_free(cllong);
523                         goto cll_malloc_err;
524                 }
525 
526                 status = clist_register(conn, cllong, 0);
527                 if (status) {
528                         cmn_err(CE_WARN, "svc krecv: clist_register failed");
529 #ifdef  SERVER_REG_CACHE
530                 RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)cllong->long_reply_buf);
531 #else
532                         if(cllong->c_len)
533                         kmem_free((void *)(uintptr_t)cllong->c_daddr,
534                                 cllong->c_len);
535 #endif
536                         if(cllong)
537                         clist_free(cllong);
538                         goto cll_malloc_err;
539                 }
540 
541                 /*
542                  * Now read the RPC call message in
543                  */
544                 status = RDMA_READ(conn, cllong, WAIT);
545                 if (status) {
546                         cmn_err(CE_WARN, "svc_rdma_krecv: rdma_read failed");
547                         (void) clist_deregister(conn, cllong, 0);
548 #ifdef  SERVER_REG_CACHE
549                 RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)cllong->long_reply_buf);
550 #else
551                         kmem_free((void *)(uintptr_t)cllong->c_daddr,
552                                 cllong->c_len);
553 #endif
554                         clist_free(cllong);
555                         goto cll_malloc_err;
556                 }
557 
558                 status = clist_syncmem(conn, cllong, 0);
559                 (void) clist_deregister(conn, cllong, 0);
560 
561                 xdrrdma_create(xdrs, (caddr_t)(uintptr_t)cllong->c_daddr,
562                     cllong->c_len, 0, cl, XDR_DECODE, conn);
563 
564                 vd->rpcbuf.type = CHUNK_BUFFER;
565                 vd->rpcbuf.addr = (caddr_t)(uintptr_t)cllong->c_daddr;
566                 vd->rpcbuf.len = cllong->c_len;
567                 vd->rpcbuf.handle.mrc_rmr = 0;
568 #ifdef  SERVER_REG_CACHE
569                 vd->rpcbuf.long_reply_buf = (rib_lrc_entry_t *)cllong->long_reply_buf;
570 #endif
571                 clist_free(cllong);
572                 RDMA_BUF_FREE(conn, &rdp->rpcmsg);
573         } else {
574                 pos = XDR_GETPOS(xdrs);
575                 xdrrdma_create(xdrs, rdp->rpcmsg.addr + pos,
576                         rdp->rpcmsg.len - pos, 0, cl, XDR_DECODE, conn);
577                 vd->rpcbuf = rdp->rpcmsg;
578         }
579 
580         if (! xdr_callmsg(xdrs, msg)) {
581                 cmn_err(CE_WARN, "svc_rdma_krecv: xdr_callmsg failed");
582                 RSSTAT_INCR(rsxdrcall);
583                 goto callmsg_err;
584         }
585 
586         /*
587          * wlist sent for something besides NFS3 READ, so ignore it.
588          * FTDO: this isn't appropriate for READLINK3, but our client
589          * will never drive writelist for READLINK3, so good enough
590          * for the demo.
591          */
592         if (vd->wlist != NULL &&
593             (msg->rm_call.cb_rpcvers != RPC_MSG_VERSION ||
594              msg->rm_call.cb_prog != NFS3_PROGRAM ||
595              msg->rm_call.cb_vers != NFS_V3 ||
596              msg->rm_call.cb_proc != NFSPROC3_READ)) {
597 #ifdef  SERVER_REG_CACHE
598                 RDMA_FREE_SERVER_CACHE_BUF(conn,(rib_lrc_entry_t *)wcl->long_reply_buf);
599 #else
600                 kmem_free((void *)wcl->c_saddr, wcl_total_length);
601 #endif
602                 clist_free(wcl);
603                 vd->wlist = NULL;
604         }
605 
606         /*
607          * Point the remote transport address in the service_transport
608          * handle at the address in the request.
609          */
610         clone_xprt->xp_rtaddr.buf = conn->c_raddr.buf;
611         clone_xprt->xp_rtaddr.len = conn->c_raddr.len;
612         clone_xprt->xp_rtaddr.maxlen = conn->c_raddr.len;
613         clone_xprt->xp_xid = xid;
614         vd->conn = conn;
615 
616         freeb(mp);
617         return (TRUE);
618 callmsg_err:
619         rdma_buf_free(conn, &vd->rpcbuf);
620 cll_malloc_err:
621         if (cl)
622                 clist_free(cl);
623         if (wcl != NULL) {
624 #ifdef  SERVER_REG_CACHE
625                 RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)wcl->long_reply_buf);
626 #else
627                 kmem_free((void *)wcl->c_saddr, wcl_total_length);
628 #endif
629                 clist_free(wcl);
630         }
631 xdr_err:
632         XDR_DESTROY(xdrs);
633 badrpc_call:
634         RDMA_BUF_FREE(conn, &rdp->rpcmsg);
635         RDMA_REL_CONN(conn);
636         freeb(mp);
637         RSSTAT_INCR(rsbadcalls);
638         return (FALSE);
639 }
640 
641 #ifdef DYNAMIC_CREDIT_CONTROL
642 static void
643 svc_consume_credit(CONN *conn)
644 {
645         rdma_srv_cred_ctrl_t *cc_info;
646 
647         mutex_enter(&conn->c_lock);
648         cc_info = &conn->rdma_conn_cred_ctrl_u.c_srv_cc;
649         cc_info->srv_cc_posted--;
650         mutex_exit(&conn->c_lock);
651 }
652 
653 static void
654 svc_compute_credit(CONN *conn, uint32_t rdma_credit, int numclnts, 
655                    int availbufs, int *to_be_posted)
656 {
657         int average, grant;
658         rdma_srv_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_srv_cc;
659 
660         if(numclnts == 0){
661                 cmn_err(CE_NOTE, "There is no active client!\n");
662                 *to_be_posted = 0;
663                 return;
664         }
665 
666         average = availbufs/numclnts;
667 
668         mutex_enter(&conn->c_lock);
669         
670         if(rdma_credit <= cc_info->srv_cc_posted)
671                 grant = cc_info->srv_cc_posted;
672         else if(rdma_credit <= average)
673                 grant = rdma_credit;
674         else 
675                 grant = average + (rdma_credit - average) * SVC_CREDIT_FACTOR;
676 
677         *to_be_posted = grant - cc_info->srv_cc_posted;
678         if(*to_be_posted < 0) 
679                 *to_be_posted = 0;
680         if(*to_be_posted > availbufs)
681                 *to_be_posted = availbufs/2;
682     
683         mutex_exit(&conn->c_lock); 
684 }
685 
686 static void
687 svc_grant_credit(CONN * conn, uint32_t * rdma_credit)
688 {
689         rdma_srv_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_srv_cc;
690      
691         mutex_enter(&conn->c_lock);
692 
693         /*
694          * Fill in the granted number of buffers
695          * for credit control.
696          *
697          * XXX Currently ignoring what the client sends.
698          */
699         *rdma_credit = cc_info->srv_cc_buffers_granted;
700         mutex_exit(&conn->c_lock);
701 }
702 
703 static void
704 svc_update_credit(CONN * conn, int i)
705 {
706         rdma_srv_cred_ctrl_t *cc_info = &conn->rdma_conn_cred_ctrl_u.c_srv_cc;
707 
708         mutex_enter(&conn->c_lock);
709         cc_info->srv_cc_buffers_granted = cc_info->srv_cc_posted + i;
710         cc_info->srv_cc_posted = cc_info->srv_cc_buffers_granted;
711         mutex_exit(&conn->c_lock);
712 }
713 #endif
714 
715 static int
716 svc_process_wlist(struct clone_rdma_data *vd, xdrproc_t xdr_results,
717                   caddr_t xdr_location, int *num_wsegment,
718                   unsigned int *templen)
719 {
720         struct clist *wcl;
721         int data_len, avail_len, num, status;
722         READ3resok *rok;
723        
724         rok = &(((READ3res *) xdr_location)->res_u.ok);
725         data_len = num = avail_len = 0;
726 
727         wcl = vd->wlist;
728         while (wcl != NULL) {
729                 if (wcl->c_dmemhandle.mrc_rmr != 0
730                     && xdr_results == x_READ3res) {
731 
732                         avail_len += wcl->c_len;
733                         if (wcl->c_len < rok->count) {
734                                 data_len += wcl->c_len;
735                         } else {
736                                 /* Can make the rest chunks all 0-len */
737                                 data_len += rok->count;
738                                 wcl->c_len = rok->count;
739                         }
740                         rok->count -= wcl->c_len;
741                         num ++;
742                 }
743                 else {
744                         cmn_err(CE_NOTE,
745                                 "svc_process_wlist: wlist has an error\n");
746                 }
747                 wcl = wcl->c_next;
748         }
749 
750         /*
751          * MUST fail if there are still more data 
752          */
753         if (rok->count > 0) {
754                 cmn_err(CE_NOTE,
755                         "svc_process_wlist: data_len is too short \n");
756                 return SVC_RDMA_FAIL;
757         }
758 
759         wcl = vd->wlist;
760         rok->count = data_len;
761         rok->wlist_len = data_len;
762         rok->wlist = wcl;
763         *num_wsegment = num;
764         *templen = avail_len;
765 
766         /* Register, sync and write over the data */
767         if (data_len > 0) {
768                 status = clist_register(vd->conn, wcl, TRUE);
769                 if (status != RDMA_SUCCESS) {
770                         cmn_err(CE_NOTE,
771                                 "svc_process_wlist: clist_register "
772                                 "failed");
773                         return SVC_RDMA_FAIL;
774                 }
775 
776                 status = clist_syncmem(vd->conn, wcl, TRUE);
777                 if (status != RDMA_SUCCESS) {
778                         cmn_err(CE_NOTE,
779                                 "svc_process_wlist: syncmem failed(%d)",
780                                 status);
781                         return SVC_RDMA_FAIL;
782                 }
783 
784                 status = RDMA_WRITE(vd->conn, wcl, NOWAIT);
785                 if (status != RDMA_SUCCESS) {
786                         cmn_err(CE_NOTE,
787                                 "svc_process_wlist: RDMA_WRITE failed(%d)",
788                                 status);
789                         return SVC_RDMA_FAIL;
790                 }
791         }
792 
793         return SVC_RDMA_SUCCESS;
794 }
795 
796 static int
797 svc_process_long_reply(SVCXPRT * clone_xprt, CONN * conn,
798                        xdrproc_t xdr_results, caddr_t xdr_location,
799                        caddr_t vd, XDR ** xdrs,
800                        struct rpc_msg *msg, bool_t has_args, int *msglen,
801                        int *freelen, int *num, unsigned int *len)
802 {
803         rdma_buf_t long_rpc = {0};
804         int status;
805         struct clist *ncl = NULL, *wcl = NULL;
806         char *memp = NULL;
807         int avail_len = 0;
808         int count = 0;
809         int data_len = 0;
810         *num = 0;
811         *freelen = 0;
812         /*
813          * If the clone_xprt struct has a reply chunk list,
814          * then we MUST RDMA_WRITE the reply back to the client,
815          * no matter what its size is.  This translates to:
816          *
817          *      RDMA_WRITE + RDMA_SEND(op = RDMA_NOMSG)
818          *
819          * XXX the rdma write code currently ignores kerberos.
820          */
821 
822         (*msglen) += xdrrdma_sizeof(xdr_results, xdr_location, rdma_minchunk);
823 
824         wcl = (struct clist *)vd;
825         count = *msglen;
826         while (wcl != NULL) {
827                 *freelen += wcl->c_len;
828                 if (wcl->c_dmemhandle.mrc_rmr != 0)
829                      {
830                         avail_len += wcl->c_len;
831                         if (wcl->c_len < count) {
832                                 data_len += wcl->c_len;
833                         } else {
834                                 data_len += count;
835                                 wcl->c_len = count;
836                         }
837                         count -= wcl->c_len;
838                         *num +=  1;
839                 }
840                 else {
841                         cmn_err(CE_NOTE,
842                                 "svc_process_long_reply: wchunk list has an error\n");
843                 }
844                 wcl = wcl->c_next;
845         }
846 
847         /*
848          * MUST fail if there are still more data 
849          */
850         if (count > 0) {
851                 cmn_err(CE_NOTE,
852                         "svc_process_long_reply: data_len is too short \n");
853                 return SVC_RDMA_FAIL;
854         }
855         /*
856          * Setup buffers for long rpc reply
857          */
858 
859         /*
860          * We specify 0 for the chunk size since we
861          * don't want a chunk list.
862          */
863         wcl = (struct clist *)vd;
864         xdrrdma_create(*xdrs, (caddr_t)wcl->c_saddr , *msglen, 0,
865                        wcl, XDR_ENCODE, NULL);
866 
867         msg->rm_xid = clone_xprt->xp_xid;
868 
869         if (!(xdr_replymsg(*xdrs, msg) &&
870               (!has_args || SVCAUTH_WRAP(&clone_xprt->xp_auth, *xdrs,
871                                          xdr_results, xdr_location)))) {
872                 kmem_free((void *)wcl->c_saddr, *freelen);
873                 cmn_err(CE_WARN, "svc_process_long_reply: "
874                         "xdr_replymsg/SVCAUTH_WRAP failed "
875                         "for long reply\n");
876                 return SVC_RDMA_FAIL;
877         }
878         *len = XDR_GETPOS(*xdrs);
879 
880         if (clist_register(conn, wcl, TRUE) != RDMA_SUCCESS) {
881 #ifdef SERVER_REG_CACHE
882         RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)wcl->long_reply_buf);
883 #else
884                 kmem_free((void *)(wcl->c_saddr), *freelen);
885 #endif
886                 cmn_err(CE_NOTE, "svc_process_long_reply: RDMA_WRITE: "
887                         "clist reg failed");
888                 return SVC_RDMA_FAIL;
889         }
890 
891         status = clist_syncmem(conn, wcl, TRUE);
892         if (status) {
893                 (void) clist_deregister(conn, wcl, TRUE);
894 #ifdef SERVER_REG_CACHE
895         RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)wcl->long_reply_buf);
896 #else
897                 kmem_free((void *)(wcl->c_saddr), *freelen);
898 #endif
899                 cmn_err(CE_NOTE,
900                         "svc_process_long_reply: sync mem failed %d", status);
901                 return SVC_RDMA_FAIL;
902         }
903 
904         /*
905          * Note: we must pass WAIT into the rdma write call to
906          * ensure that the call completes before we move on, where
907          * part of 'moving on' is deregistering the memory -- and
908          * if the memory is deregistered before the write completes
909          * we'll have an error.
910          */
911         status = RDMA_WRITE(conn, wcl, NOWAIT);
912         if (status != RDMA_SUCCESS) {
913                 (void) clist_deregister(conn, wcl, TRUE);
914 #ifdef SERVER_REG_CACHE
915         RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)wcl->long_reply_buf);
916 #else
917                 kmem_free((void *)(wcl->c_saddr), *freelen);
918 #endif
919                 cmn_err(CE_NOTE,
920                         "svc_process_long_reply: RDMA_WRITE failed %d",
921                         status);
922                 return SVC_RDMA_FAIL;
923         }
924 
925         return SVC_RDMA_SUCCESS;
926 }
927 
928 static int
929 svc_compose_rpcmsg(SVCXPRT * clone_xprt, CONN * conn, xdrproc_t xdr_results,
930                    caddr_t xdr_location, rdma_buf_t * rpcreply, XDR ** xdrs,
931                    struct rpc_msg *msg, bool_t has_args, int *msglen,
932                    unsigned int *len)
933 {
934         int auth_flavor = msg->rm_reply.rp_acpt.ar_verf.oa_flavor;
935 
936         if (has_args && auth_flavor != RPCSEC_GSS)
937                 (*msglen) += xdrrdma_sizeof(xdr_results, xdr_location,
938                                           rdma_minchunk);
939         else if (has_args && auth_flavor == RPCSEC_GSS) {
940                 (*msglen) +=
941                         2 * MAX_AUTH_BYTES + 2 * sizeof(struct opaque_auth);
942                 (*msglen) += xdr_sizeof(xdr_results, xdr_location);
943         }
944 
945         if (*msglen > RPC_MSG_SZ) {
946                 cmn_err(CE_NOTE,
947                         "svc_compose_rpcmsg: Server needs to send a reply"
948                         "larger than RPC_MSG_SZ\n");
949                 return SVC_RDMA_FAIL;
950         }
951 
952         /*
953          * Get a pre-allocated buffer for rpc reply
954          */
955         rpcreply->type = SEND_BUFFER;
956         if (RDMA_BUF_ALLOC(conn, rpcreply)) {
957                 cmn_err(CE_WARN, "svc_compose_rpcmsg: no free buffers!");
958                 return SVC_RDMA_FAIL;
959         }
960 
961         if (has_args == FALSE || auth_flavor != RPCSEC_GSS) {
962                 xdrrdma_create(*xdrs, rpcreply->addr, rpcreply->len,
963                                rdma_minchunk, NULL, XDR_ENCODE, NULL);
964         }
965         else {
966                 xdrrdma_create(*xdrs, rpcreply->addr, *msglen, 0, NULL,
967                                XDR_ENCODE, NULL);
968         }
969 
970         msg->rm_xid = clone_xprt->xp_xid;
971 
972         if (has_args) {
973                 if (!(xdr_replymsg(*xdrs, msg) &&
974                       (!has_args
975                        || SVCAUTH_WRAP(&clone_xprt->xp_auth, *xdrs,
976                                        xdr_results, xdr_location)))) {
977                         if (auth_flavor == RPCSEC_GSS
978                             && rpcreply->addr != (*xdrs)->x_base) {
979                                 rpcreply->addr = (*xdrs)->x_base;
980                                 rpcreply->len = xdr_getbufsize(*xdrs);
981                         }
982                         rdma_buf_free(conn, rpcreply);
983                         cmn_err(CE_WARN,
984                                 "svc_compose_rpcmsg: xdr_replymsg/SVCAUTH_WRAP "
985                                 "failed");
986                         return SVC_RDMA_FAIL;
987                 }
988                 if (auth_flavor == RPCSEC_GSS
989                     && rpcreply->addr != (*xdrs)->x_base) {
990                         rpcreply->addr = (*xdrs)->x_base;
991                         rpcreply->len = xdr_getbufsize(*xdrs);
992                 }
993         }
994         else {
995                 if (!xdr_replymsg(*xdrs, msg)) {
996                         rdma_buf_free(conn, rpcreply);
997                         cmn_err(CE_WARN,
998                                 "svc_compose_rpcmsg: xdr_replymsg/SVCAUTH_WRAP "
999                                 "failed");
1000                         return SVC_RDMA_FAIL;
1001                 }
1002         }
1003 
1004         *len = XDR_GETPOS(*xdrs);
1005 
1006         if (auth_flavor == RPCSEC_GSS) {
1007                 XDR_DESTROY(*xdrs);
1008                 xdrrdma_create(*xdrs, rpcreply->addr, *len, 0, NULL,
1009                                XDR_ENCODE, NULL);
1010         }
1011 
1012         return SVC_RDMA_SUCCESS;
1013 }
1014 
1015 /*
1016  * Send rpc reply.
1017  */
1018 static bool_t
1019 svc_rdma_ksend(SVCXPRT * clone_xprt, struct rpc_msg *msg)
1020 {
1021         XDR *xdrs = &(clone_xprt->xp_xdrout);
1022         XDR rxdrs;
1023         CONN *conn = NULL;
1024         rdma_buf_t clmsg = {0}, rpcreply = {0};
1025 
1026         struct clone_rdma_data *vd;
1027         struct clist *cl = NULL;
1028         struct clist *sendlist = NULL;
1029         struct clist *wcl = NULL;
1030         struct clist *reply_cl;
1031         xdrproc_t xdr_results;
1032         caddr_t xdr_location;
1033 
1034         int retval = FALSE;
1035         int status, msglen, num_wsegment = 0, num_wreply_segments = 0;
1036         uint32_t rdma_credit = 0, templen = 0;
1037         int freelen =0;
1038         bool_t has_args;
1039         uint_t  len, op, vers;
1040 
1041         vd = (struct clone_rdma_data *) clone_xprt->xp_p2buf;
1042         conn = vd->conn;
1043 
1044         /*
1045          * If there is a result procedure specified in the reply message,
1046          * it will be processed in the xdr_replymsg and SVCAUTH_WRAP.
1047          * We need to make sure it won't be processed twice, so we null
1048          * it for xdr_replymsg here.
1049          */
1050         has_args = FALSE;
1051         if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
1052             msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
1053                 if ((xdr_results = msg->acpted_rply.ar_results.proc) != NULL) {
1054                         has_args = TRUE;
1055                         xdr_location = msg->acpted_rply.ar_results.where;
1056                         msg->acpted_rply.ar_results.proc = xdr_void;
1057                         msg->acpted_rply.ar_results.where = NULL;
1058                 }
1059         }
1060 
1061         /*
1062          * Use RDMA Write to return content requested by wlist.
1063          * Only 1 writechunk in writelist for now,
1064          * but this chunk can contain multiple rdma segments.
1065          */
1066         if (vd->wlist) {
1067                 status = svc_process_wlist(vd, xdr_results, xdr_location,
1068                                            &num_wsegment, &templen);
1069                 if (status != SVC_RDMA_SUCCESS) {
1070                         goto out;
1071                 }
1072         }
1073 #ifdef RPC_RDMA_INLINE
1074         else if (xdr_results == x_READ3res) {
1075                 READ3resok *rok;
1076                 rok = &(((READ3res *) xdr_location)->res_u.ok);
1077                 rok->wlist = NULL;
1078          }
1079 #endif
1080 
1081         /*
1082          * Get the size of the rpc reply message. 
1083          */
1084         msglen = xdr_sizeof(xdr_replymsg, msg);
1085 
1086         /*reply_cl.c_daddr = NULL;*/
1087                 reply_cl = vd->reply_cl;
1088                 
1089                 if (vd->reply_cl) {
1090                 
1091         status = svc_process_long_reply(clone_xprt,
1092                                                 conn, xdr_results,
1093                                                 xdr_location, (caddr_t)vd->reply_cl,
1094                                                 &xdrs, msg, has_args,
1095                                                 &msglen, &freelen, &num_wreply_segments,  &len);
1096                 if (status == SVC_RDMA_SUCCESS) {
1097                         op = RDMA_NOMSG;
1098                         cl = NULL;
1099                         goto rdma_writed_long_reply_out;
1100                 }
1101                 else
1102                         goto out;
1103         }
1104         status = svc_compose_rpcmsg(clone_xprt, conn, xdr_results,
1105                                     xdr_location, &rpcreply, &xdrs, msg,
1106                                     has_args, &msglen, &len);
1107         if (status != SVC_RDMA_SUCCESS)
1108                 goto out;
1109 
1110         op = RDMA_MSG;
1111 
1112         cl = xdrrdma_clist(xdrs);
1113         cl = NULL;
1114         if (cl != NULL) {
1115                 cmn_err(CE_NOTE,
1116                         "svc_rdma_ksend: Should not provide non-null"
1117                         "read chunk list to client\n");
1118         }
1119 
1120 rdma_writed_long_reply_out:
1121 
1122         clmsg.type = SEND_BUFFER;
1123         if (RDMA_BUF_ALLOC(conn, &clmsg)) {
1124                 rdma_buf_free(conn, &rpcreply);
1125                 cmn_err(CE_WARN, "svc_rdma_ksend: no free buffers!!");
1126                 goto out;
1127         }
1128 
1129 #ifdef DYNAMIC_CREDIT_CONTROL
1130         svc_grant_credit(conn, &rdma_credit);
1131 #else
1132         rdma_credit = rdma_bufs_granted;        
1133 #endif
1134 
1135         vers = RPCRDMA_VERS;
1136         xdrs = &rxdrs;
1137         xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE);
1138         (*(uint32_t *) clmsg.addr) = msg->rm_xid;
1139         /* Skip xid and set the xdr position accordingly. */
1140         XDR_SETPOS(xdrs, sizeof(uint32_t));
1141         if (!xdr_u_int(xdrs, &vers) ||
1142             !xdr_u_int(xdrs, &rdma_credit) || !xdr_u_int(xdrs, &op)) {
1143                 rdma_buf_free(conn, &rpcreply);
1144                 rdma_buf_free(conn, &clmsg);
1145                 cmn_err(CE_WARN, "svc_rdma_ksend: xdr_u_int failed");
1146                 goto out;
1147         }
1148 
1149         /*
1150          * Now XDR the read chunk list, actually always NULL
1151          */
1152         (void) xdr_do_clist(xdrs, &cl);
1153 
1154         /*
1155          * encode write list -- we already drove RDMA_WRITEs
1156          */
1157         wcl = vd->wlist;
1158         if (!xdr_encode_wlist(xdrs, wcl, num_wsegment)) {
1159                 cmn_err(CE_NOTE,
1160                         "svc_rdma_ksend: xdr_encode_wlist failed: "
1161                         "wcl=%p", (void *) wcl);
1162                 rdma_buf_free(conn, &rpcreply);
1163                 rdma_buf_free(conn, &clmsg);
1164                 goto out;
1165         }
1166 
1167         /*
1168          * XDR encode the RDMA_REPLY write chunk
1169          */
1170         (void) xdr_encode_reply_wchunk(xdrs, vd->reply_cl, num_wreply_segments);
1171 
1172         clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle, clmsg.addr,
1173                   NULL, NULL);
1174 
1175         if (op == RDMA_MSG) {
1176                 clist_add(&sendlist, 0, len, &rpcreply.handle,
1177                           rpcreply.addr, NULL, NULL);
1178         }
1179 
1180 #if defined(ASYNC_SERVER_DEREG)
1181         status = RDMA_SEND_NW(conn, sendlist, msg->rm_xid, (caddr_t)conn, 
1182                                  (caddr_t)vd->wlist, 
1183                                  templen, 
1184                                  (caddr_t)reply_cl, 
1185                                  freelen, num_wsegment, num_wreply_segments );
1186 #else
1187         status = RDMA_SEND(conn, sendlist, msg->rm_xid);
1188 #endif
1189         if (status != RDMA_SUCCESS) {
1190                 goto out;
1191         }
1192 
1193         retval = TRUE;
1194 
1195 out:
1196 
1197         /*
1198          * Free up sendlist chunks
1199          */
1200         if (sendlist != NULL)
1201                 clist_free(sendlist);
1202 
1203         /*
1204          * Destroy private data for xdr rdma
1205          */
1206         if ((clone_xprt->xp_xdrout).x_private)
1207                 XDR_DESTROY(&(clone_xprt->xp_xdrout));
1208         if (rxdrs.x_private)
1209                 XDR_DESTROY(&rxdrs);
1210 
1211         /*
1212          * This is completely disgusting.  If public is set it is
1213          * a pointer to a structure whose first field is the address
1214          * of the function to free that structure and any related
1215          * stuff.  (see rrokfree in nfs_xdr.c).
1216          */
1217         if (xdrs->x_public) {
1218                 /* LINTED pointer alignment */
1219                 (**((int (**)()) xdrs->x_public)) (xdrs->x_public);
1220         }
1221 
1222         if (vd->wlist != NULL) {
1223 #if defined(ASYNC_SERVER_DEREG)
1224         if(!retval) {
1225 #endif
1226                 wcl = vd->wlist;
1227                 (void) clist_deregister(vd->conn, wcl, TRUE);
1228 #ifdef  SERVER_REG_CACHE
1229                 RDMA_FREE_SERVER_CACHE_BUF(vd->conn, (rib_lrc_entry_t *)wcl->long_reply_buf);
1230 #else
1231                 if(templen)
1232                 kmem_free((void *) (vd->wlist)->c_saddr, templen);
1233 #endif
1234                 kmem_free(vd->wlist, num_wsegment * sizeof(struct clist));
1235 #if defined(ASYNC_SERVER_DEREG)
1236         }
1237 #endif
1238                 vd->wlist = NULL;
1239         }
1240 
1241 
1242         if(vd->reply_cl != NULL){
1243 #if defined(ASYNC_SERVER_DEREG)
1244         if(!retval) {
1245 #endif
1246                 (void) clist_deregister(conn, reply_cl, TRUE);
1247 #ifdef SERVER_REG_CACHE
1248         RDMA_FREE_SERVER_CACHE_BUF(conn, (rib_lrc_entry_t *)reply_cl->long_reply_buf);
1249 #else
1250 #ifdef DEBUG
1251                 if(rdma_svc_debug > 1)
1252                 cmn_err(CE_NOTE, "Freeing up %p of length %d\n",reply_cl->c_saddr,freelen);
1253 #endif
1254                 if(freelen)
1255                 kmem_free((void *)(reply_cl->c_saddr), freelen);
1256 #endif
1257                 kmem_free((void *)vd->reply_cl, num_wreply_segments * sizeof(struct clist));
1258 #if defined(ASYNC_SERVER_DEREG)
1259                 }
1260 #endif
1261                 vd->reply_cl = NULL;
1262         }                          
1263         return (retval);
1264 }
1265 
1266 /*
1267  * Deserialize arguments.
1268  */
1269 static bool_t
1270 svc_rdma_kgetargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args, caddr_t args_ptr)
1271 {
1272         if ((SVCAUTH_UNWRAP(&clone_xprt->xp_auth, &clone_xprt->xp_xdrin,
1273             xdr_args, args_ptr)) != TRUE)
1274                 return (FALSE);
1275         return (TRUE);
1276 }
1277 
1278 static bool_t
1279 svc_rdma_kfreeargs(SVCXPRT *clone_xprt, xdrproc_t xdr_args,
1280     caddr_t args_ptr)
1281 {
1282         struct clone_rdma_data *vd;
1283         bool_t retval;
1284 
1285         vd = (struct clone_rdma_data *)clone_xprt->xp_p2buf;
1286         if (args_ptr) {
1287                 XDR     *xdrs = &clone_xprt->xp_xdrin;
1288                 struct clist *cl;
1289 
1290                 cl = xdrrdma_clist(xdrs);
1291                 if (cl != NULL)
1292                         clist_free(cl);
1293 
1294                 xdrs->x_op = XDR_FREE;
1295                 retval = (*xdr_args)(xdrs, args_ptr);
1296         }
1297         XDR_DESTROY(&(clone_xprt->xp_xdrin));
1298         rdma_buf_free(vd->conn, &vd->rpcbuf);
1299         RDMA_REL_CONN(vd->conn);
1300         return (retval);
1301 }
1302 
1303 /* ARGSUSED */
1304 static int32_t *
1305 svc_rdma_kgetres(SVCXPRT *clone_xprt, int size)
1306 {
1307         return (NULL);
1308 }
1309 
1310 /* ARGSUSED */
1311 static void
1312 svc_rdma_kfreeres(SVCXPRT *clone_xprt)
1313 {
1314 }
1315 
1316 /*
1317  * the dup cacheing routines below provide a cache of non-failure
1318  * transaction id's.  rpc service routines can use this to detect
1319  * retransmissions and re-send a non-failure response.
1320  */
1321 
1322 /*
1323  * MAXDUPREQS is the number of cached items.  It should be adjusted
1324  * to the service load so that there is likely to be a response entry
1325  * when the first retransmission comes in.
1326  */
1327 #define MAXDUPREQS      1024
1328 
1329 /*
1330  * This should be appropriately scaled to MAXDUPREQS.
1331  */
1332 #define DRHASHSZ        257
1333 
1334 #if ((DRHASHSZ & (DRHASHSZ - 1)) == 0)
1335 #define XIDHASH(xid)    ((xid) & (DRHASHSZ - 1))
1336 #else
1337 #define XIDHASH(xid)    ((xid) % DRHASHSZ)
1338 #endif
1339 #define DRHASH(dr)      XIDHASH((dr)->dr_xid)
1340 #define REQTOXID(req)   ((req)->rq_xprt->xp_xid)
1341 
1342 static int      rdmandupreqs = 0;
1343 static int      rdmamaxdupreqs = MAXDUPREQS;
1344 static kmutex_t rdmadupreq_lock;
1345 static struct dupreq *rdmadrhashtbl[DRHASHSZ];
1346 static int      rdmadrhashstat[DRHASHSZ];
1347 
1348 static void unhash(struct dupreq *);
1349 
1350 /*
1351  * rdmadrmru points to the head of a circular linked list in lru order.
1352  * rdmadrmru->dr_next == drlru
1353  */
1354 struct dupreq *rdmadrmru;
1355 
1356 /*
1357  * svc_rdma_kdup searches the request cache and returns 0 if the
1358  * request is not found in the cache.  If it is found, then it
1359  * returns the state of the request (in progress or done) and
1360  * the status or attributes that were part of the original reply.
1361  */
1362 static int
1363 svc_rdma_kdup(struct svc_req *req, caddr_t res, int size, struct dupreq **drpp,
1364         bool_t *dupcachedp)
1365 {
1366         struct dupreq *dr;
1367         uint32_t xid;
1368         uint32_t drhash;
1369         int status;
1370 
1371         xid = REQTOXID(req);
1372         mutex_enter(&rdmadupreq_lock);
1373         RSSTAT_INCR(rsdupchecks);
1374         /*
1375          * Check to see whether an entry already exists in the cache.
1376          */
1377         dr = rdmadrhashtbl[XIDHASH(xid)];
1378         while (dr != NULL) {
1379                 if (dr->dr_xid == xid &&
1380                     dr->dr_proc == req->rq_proc &&
1381                     dr->dr_prog == req->rq_prog &&
1382                     dr->dr_vers == req->rq_vers &&
1383                     dr->dr_addr.len == req->rq_xprt->xp_rtaddr.len &&
1384                     bcmp((caddr_t)dr->dr_addr.buf,
1385                     (caddr_t)req->rq_xprt->xp_rtaddr.buf,
1386                     dr->dr_addr.len) == 0) {
1387                         status = dr->dr_status;
1388                         if (status == DUP_DONE) {
1389                                 bcopy(dr->dr_resp.buf, res, size);
1390                                 if (dupcachedp != NULL)
1391                                         *dupcachedp = (dr->dr_resfree != NULL);
1392                         } else {
1393                                 dr->dr_status = DUP_INPROGRESS;
1394                                 *drpp = dr;
1395                         }
1396                         RSSTAT_INCR(rsdupreqs);
1397                         mutex_exit(&rdmadupreq_lock);
1398                         return (status);
1399                 }
1400                 dr = dr->dr_chain;
1401         }
1402 
1403         /*
1404          * There wasn't an entry, either allocate a new one or recycle
1405          * an old one.
1406          */
1407         if (rdmandupreqs < rdmamaxdupreqs) {
1408                 dr = kmem_alloc(sizeof (*dr), KM_NOSLEEP);
1409                 if (dr == NULL) {
1410                         mutex_exit(&rdmadupreq_lock);
1411                         return (DUP_ERROR);
1412                 }
1413                 dr->dr_resp.buf = NULL;
1414                 dr->dr_resp.maxlen = 0;
1415                 dr->dr_addr.buf = NULL;
1416                 dr->dr_addr.maxlen = 0;
1417                 if (rdmadrmru) {
1418                         dr->dr_next = rdmadrmru->dr_next;
1419                         rdmadrmru->dr_next = dr;
1420                 } else {
1421                         dr->dr_next = dr;
1422                 }
1423                 rdmandupreqs++;
1424         } else {
1425                 dr = rdmadrmru->dr_next;
1426                 while (dr->dr_status == DUP_INPROGRESS) {
1427                         dr = dr->dr_next;
1428                         if (dr == rdmadrmru->dr_next) {
1429                                 cmn_err(CE_WARN, "svc_rdma_kdup no slots free");
1430                                 mutex_exit(&rdmadupreq_lock);
1431                                 return (DUP_ERROR);
1432                         }
1433                 }
1434                 unhash(dr);
1435                 if (dr->dr_resfree) {
1436                         (*dr->dr_resfree)(dr->dr_resp.buf);
1437                 }
1438         }
1439         dr->dr_resfree = NULL;
1440         rdmadrmru = dr;
1441 
1442         dr->dr_xid = REQTOXID(req);
1443         dr->dr_prog = req->rq_prog;
1444         dr->dr_vers = req->rq_vers;
1445         dr->dr_proc = req->rq_proc;
1446         if (dr->dr_addr.maxlen < req->rq_xprt->xp_rtaddr.len) {
1447                 if (dr->dr_addr.buf != NULL)
1448                         kmem_free(dr->dr_addr.buf, dr->dr_addr.maxlen);
1449                 dr->dr_addr.maxlen = req->rq_xprt->xp_rtaddr.len;
1450                 dr->dr_addr.buf = kmem_alloc(dr->dr_addr.maxlen, KM_NOSLEEP);
1451                 if (dr->dr_addr.buf == NULL) {
1452                         dr->dr_addr.maxlen = 0;
1453                         dr->dr_status = DUP_DROP;
1454                         mutex_exit(&rdmadupreq_lock);
1455                         return (DUP_ERROR);
1456                 }
1457         }
1458         dr->dr_addr.len = req->rq_xprt->xp_rtaddr.len;
1459         bcopy(req->rq_xprt->xp_rtaddr.buf, dr->dr_addr.buf, dr->dr_addr.len);
1460         if (dr->dr_resp.maxlen < size) {
1461                 if (dr->dr_resp.buf != NULL)
1462                         kmem_free(dr->dr_resp.buf, dr->dr_resp.maxlen);
1463                 dr->dr_resp.maxlen = (unsigned int)size;
1464                 dr->dr_resp.buf = kmem_alloc(size, KM_NOSLEEP);
1465                 if (dr->dr_resp.buf == NULL) {
1466                         dr->dr_resp.maxlen = 0;
1467                         dr->dr_status = DUP_DROP;
1468                         mutex_exit(&rdmadupreq_lock);
1469                         return (DUP_ERROR);
1470                 }
1471         }
1472         dr->dr_status = DUP_INPROGRESS;
1473 
1474         drhash = (uint32_t)DRHASH(dr);
1475         dr->dr_chain = rdmadrhashtbl[drhash];
1476         rdmadrhashtbl[drhash] = dr;
1477         rdmadrhashstat[drhash]++;
1478         mutex_exit(&rdmadupreq_lock);
1479         *drpp = dr;
1480         return (DUP_NEW);
1481 }
1482 
1483 /*
1484  * svc_rdma_kdupdone marks the request done (DUP_DONE or DUP_DROP)
1485  * and stores the response.
1486  */
1487 static void
1488 svc_rdma_kdupdone(struct dupreq *dr, caddr_t res, void (*dis_resfree)(),
1489         int size, int status)
1490 {
1491         ASSERT(dr->dr_resfree == NULL);
1492         if (status == DUP_DONE) {
1493                 bcopy(res, dr->dr_resp.buf, size);
1494                 dr->dr_resfree = dis_resfree;
1495         }
1496         dr->dr_status = status;
1497 }
1498 
1499 /*
1500  * This routine expects that the mutex, rdmadupreq_lock, is already held.
1501  */
1502 static void
1503 unhash(struct dupreq *dr)
1504 {
1505         struct dupreq *drt;
1506         struct dupreq *drtprev = NULL;
1507         uint32_t drhash;
1508 
1509         ASSERT(MUTEX_HELD(&rdmadupreq_lock));
1510 
1511         drhash = (uint32_t)DRHASH(dr);
1512         drt = rdmadrhashtbl[drhash];
1513         while (drt != NULL) {
1514                 if (drt == dr) {
1515                         rdmadrhashstat[drhash]--;
1516                         if (drtprev == NULL) {
1517                                 rdmadrhashtbl[drhash] = drt->dr_chain;
1518                         } else {
1519                                 drtprev->dr_chain = drt->dr_chain;
1520                         }
1521                         return;
1522                 }
1523                 drtprev = drt;
1524                 drt = drt->dr_chain;
1525         }
1526 }
1527 
1528 bool_t
1529 rdma_get_wchunk_seg(struct svc_req *req, iovec_t *iov)
1530 {
1531        struct clone_rdma_data *rcd;
1532        struct clist           *clist;
1533        uint32_t        tlen;
1534 
1535        if (req->rq_xprt->xp_type != T_RDMA) {
1536                return (FALSE);
1537        }
1538 
1539        rcd = (struct clone_rdma_data *)(&req->rq_xprt->xp_p2buf);
1540        if (rcd->wlist == NULL) {
1541                return (FALSE);
1542        }
1543        tlen = 0;
1544        clist = rcd->wlist;
1545        while(clist){
1546         tlen += clist->c_len;        
1547         clist = clist->c_next;
1548         }
1549 
1550        /*
1551         * set iov to addr+len of first segment of first wchunk of
1552         * wlist sent by client.  krecv() already malloc'd a buffer
1553         * large enough, but registration is deferred until we write
1554         * the buffer back to (NFS) client using RDMA_WRITE.
1555         */
1556        iov->iov_base = (caddr_t)rcd->wlist->c_saddr;
1557        iov->iov_len = tlen;   
1558 
1559        return (TRUE);
1560 }
1561