New nfs_client.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 *      All rights reserved.
 */

/*
 * Copyright (c) 2006, The Ohio State University. All rights reserved.
 *
 * Portions of this source code were developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements of contributions from developers:
 *   Ranjit Noronha: noronha@cse.ohio-state.edu
 *   Lei Chai      : chail@cse.ohio-state.edu
 *   Weikuan Yu    : yuw@cse.ohio-state.edu
 */

#pragma ident   "@(#)nfs_client.c       1.193   05/10/11 SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>

#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

static void     nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
                        cred_t *);
static int      nfs_getattr_cache(vnode_t *, struct vattr *);
static int      nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);

struct mi_globals {
        kmutex_t        mig_lock;  /* lock protecting mig_list */
        list_t          mig_list;  /* list of NFS v2 or v3 mounts in zone */
        boolean_t       mig_destructor_called;
};

static zone_key_t mi_list_key;

/* Debugging flag for PC file shares. */
extern int      share_debug;

/*
 * Used by the RDMA transport to easily recognize a READ3 call/reply.
 * (FTDO -- for the demo only.  Better design needed for NFS4 or ON10 putback.)
 */

extern xdrproc_t x_READ3args;
extern xdrproc_t x_READ3res;
extern xdrproc_t x_READ3uiores;
extern xdrproc_t x_READ3vres;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_attrtime)
 * which tells whether the attributes are valid. The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached. This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
 */
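
/*
 * For example (a sketch assuming the default mount options, acregmin=3s
 * and acregmax=60s): a regular file whose last detected change was 10
 * seconds ago gets a 10 second attribute timeout, one that changed 1
 * second ago is clamped up to the 3 second minimum, and one that has
 * not changed for ten minutes is clamped down to the 60 second maximum.
 * See nfs_attrcache_va() below for the actual computation.
 */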

int
nfs_waitfor_purge_complete(vnode_t *vp)
{
        rnode_t *rp;
        k_sigset_t smask;

        rp = VTOR(vp);
        if (rp->r_serial != NULL && rp->r_serial != curthread) {
                mutex_enter(&rp->r_statelock);
                sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
                while (rp->r_serial != NULL) {
                        if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                sigunintr(&smask);
                                mutex_exit(&rp->r_statelock);
                                return (EINTR);
                        }
                }
                sigunintr(&smask);
                mutex_exit(&rp->r_statelock);
        }
        return (0);
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs_validate_caches(vnode_t *vp, cred_t *cr)
{
        int error;
        struct vattr va;

        if (ATTRCACHE_VALID(vp)) {
                error = nfs_waitfor_purge_complete(vp);
                if (error)
                        return (error);
                return (0);
        }

        va.va_mask = AT_ALL;
        return (nfs_getattr_otw(vp, &va, cr));
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs3_validate_caches(vnode_t *vp, cred_t *cr)
{
        int error;
        struct vattr va;

        if (ATTRCACHE_VALID(vp)) {
                error = nfs_waitfor_purge_complete(vp);
                if (error)
                        return (error);
                return (0);
        }

        va.va_mask = AT_ALL;
        return (nfs3_getattr_otw(vp, &va, cr));
}

/*
 * Purge all of the various NFS `data' caches.
 */
void
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
{
        rnode_t *rp;
        char *contents;
        int size;
        int error;

        /*
         * Purge the DNLC for any entries which refer to this file.
         * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
         */
        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        if (vp->v_count > 1 &&
            (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
            !(rp->r_flags & RINDNLCPURGE)) {
                /*
                 * Set the RINDNLCPURGE flag to prevent recursive entry
                 * into dnlc_purge_vp()
                 */
                if (vp->v_type == VDIR)
                        rp->r_flags |= RINDNLCPURGE;
                mutex_exit(&rp->r_statelock);
                dnlc_purge_vp(vp);
                mutex_enter(&rp->r_statelock);
                if (rp->r_flags & RINDNLCPURGE)
                        rp->r_flags &= ~RINDNLCPURGE;
        }

        /*
         * Clear any readdir state bits and purge the readlink response cache.
         */
        contents = rp->r_symlink.contents;
        size = rp->r_symlink.size;
        rp->r_symlink.contents = NULL;
        mutex_exit(&rp->r_statelock);

        if (contents != NULL)
                kmem_free((void *)contents, size);

        /*
         * Flush the page cache.
         */
        if (vn_has_cached_data(vp)) {
                error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
                if (error && (error == ENOSPC || error == EDQUOT)) {
                        mutex_enter(&rp->r_statelock);
                        if (!rp->r_error)
                                rp->r_error = error;
                        mutex_exit(&rp->r_statelock);
                }
        }

        /*
         * Flush the readdir response cache.
         */
        if (HAVE_RDDIR_CACHE(rp))
                nfs_purge_rddir_cache(vp);
}

/*
 * Purge the readdir cache of all entries
 */
void
nfs_purge_rddir_cache(vnode_t *vp)
{
        rnode_t *rp;
        rddir_cache *rdc;
        rddir_cache *nrdc;

        rp = VTOR(vp);
top:
        mutex_enter(&rp->r_statelock);
        rp->r_direof = NULL;
        rp->r_flags &= ~RLOOKUP;
        rp->r_flags |= RREADDIRPLUS;
        rdc = avl_first(&rp->r_dir);
        while (rdc != NULL) {
                nrdc = AVL_NEXT(&rp->r_dir, rdc);
                avl_remove(&rp->r_dir, rdc);
                rddir_cache_rele(rdc);
                rdc = nrdc;
        }
        mutex_exit(&rp->r_statelock);
}

/*
 * Do a cache check based on the post-operation attributes.
 * Then make them the new cached attributes.  If no attributes
 * were returned, then mark the attributes as timed out.
 */
void
nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
{
        vattr_t attr;

        if (!poap->attributes) {
                PURGE_ATTRCACHE(vp);
                return;
        }
        (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
}

/*
 * Same as above, but using a vattr
 */
void
nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
    cred_t *cr)
{
        if (!poap->attributes) {
                PURGE_ATTRCACHE(vp);
                return;
        }
        nfs_attr_cache(vp, poap->fres.vap, t, cr);
}

/*
 * Do a cache check based on the weak cache consistency attributes.
 * These consist of a small set of pre-operation attributes and the
 * full set of post-operation attributes.
 *
 * If we are given the pre-operation attributes, then use them to
 * check the validity of the various caches.  Then, if we got the
 * post-operation attributes, make them the new cached attributes.
 * If we didn't get the post-operation attributes, then mark the
 * attribute cache as timed out so that the next reference will
 * cause a GETATTR to the server to refresh with the current
 * attributes.
 *
 * Otherwise, if we didn't get the pre-operation attributes, but
 * we did get the post-operation attributes, then use these
 * attributes to check the validity of the various caches.  This
 * will probably cause a flush of the caches because if the
 * operation succeeded, the attributes of the object were changed
 * in some way from the old post-operation attributes.  This
 * should be okay because it is the safe thing to do.  After
 * checking the data caches, then we make these the new cached
 * attributes.
 *
 * Otherwise, we didn't get either the pre- or post-operation
 * attributes.  Simply mark the attribute cache as timed out so
 * the next reference will cause a GETATTR to the server to
 * refresh with the current attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
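/*
 * A compact summary of the cases handled below (sketch):
 *
 *      pre-op attrs  post-op attrs  action
 *      ------------  -------------  -----------------------------------
 *      yes           yes            validate caches against pre-op,
 *                                   then cache post-op
 *      no            yes            validate caches against post-op,
 *                                   then cache post-op
 *      --            no             mark attribute cache timed out
 *      --            unconvertible  mark attribute cache timed out
 */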
void
nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
{
        vattr_t bva;
        vattr_t ava;

        if (wccp->after.attributes) {
                if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
                        PURGE_ATTRCACHE(vp);
                        return;
                }
                if (wccp->before.attributes) {
                        bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
                        bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
                        bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
                        bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
                        bva.va_size = wccp->before.attr.size;
                        nfs3_attr_cache(vp, &bva, &ava, t, cr);
                } else
                        nfs_attr_cache(vp, &ava, t, cr);
        } else {
                PURGE_ATTRCACHE(vp);
        }
}

/*
 * Set attributes cache for given vnode using nfsattr.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
{
        rnode_t *rp;
        struct vattr va;

        if (!nattr_to_vattr(vp, na, &va)) {
                rp = VTOR(vp);
                mutex_enter(&rp->r_statelock);
                if (rp->r_mtime <= t)
                        nfs_attrcache_va(vp, &va);
                mutex_exit(&rp->r_statelock);
        } else {
                PURGE_ATTRCACHE(vp);
        }
}

/*
 * Set attributes cache for given vnode using fattr3.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
{
        rnode_t *rp;
        struct vattr va;

        if (!fattr3_to_vattr(vp, na, &va)) {
                rp = VTOR(vp);
                mutex_enter(&rp->r_statelock);
                if (rp->r_mtime <= t)
                        nfs_attrcache_va(vp, &va);
                mutex_exit(&rp->r_statelock);
        } else {
                PURGE_ATTRCACHE(vp);
        }
}

/*
 * Do a cache check based on attributes returned over the wire.  The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
    cred_t *cr)
{
        int error;

        error = nattr_to_vattr(vp, na, vap);
        if (error)
                return (error);
        nfs_attr_cache(vp, vap, t, cr);
        return (0);
}

/*
 * Do a cache check based on attributes returned over the wire.  The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
{
        int error;

        error = fattr3_to_vattr(vp, na, vap);
        if (error)
                return (error);
        nfs_attr_cache(vp, vap, t, cr);
        return (0);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */
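/*
 * Note (an observation on the code below): if the wait for another
 * thread's pending invalidation is interrupted by a signal, the new
 * attributes are simply not cached and the existing attribute cache
 * is left to time out on its own.
 */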
void
nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
{
        rnode_t *rp;
        int mtime_changed;
        int ctime_changed;
        vsecattr_t *vsp;
        int was_serial;

        rp = VTOR(vp);

        mutex_enter(&rp->r_statelock);

        if (rp->r_serial != curthread) {
                klwp_t *lwp = ttolwp(curthread);

                was_serial = 0;
                if (lwp != NULL)
                        lwp->lwp_nostop++;
                while (rp->r_serial != NULL) {
                        if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                mutex_exit(&rp->r_statelock);
                                if (lwp != NULL)
                                        lwp->lwp_nostop--;
                                return;
                        }
                }
                if (lwp != NULL)
                        lwp->lwp_nostop--;
        } else
                was_serial = 1;

        if (rp->r_mtime > t) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        if (!(rp->r_flags & RWRITEATTR)) {
                if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
                        mtime_changed = 1;
                else
                        mtime_changed = 0;
                if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
                    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
                        ctime_changed = 1;
                else
                        ctime_changed = 0;
        } else if (rp->r_size != vap->va_size &&
                    (!vn_has_cached_data(vp) ||
                    (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) {
                mtime_changed = 1;
                ctime_changed = 0;
        } else {
                mtime_changed = 0;
                ctime_changed = 0;
        }

        nfs_attrcache_va(vp, vap);

        if (!mtime_changed && !ctime_changed) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        rp->r_serial = curthread;

        mutex_exit(&rp->r_statelock);

        if (mtime_changed)
                nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

        if (ctime_changed) {
                (void) nfs_access_purge_rp(rp);
                if (rp->r_secattr != NULL) {
                        mutex_enter(&rp->r_statelock);
                        vsp = rp->r_secattr;
                        rp->r_secattr = NULL;
                        mutex_exit(&rp->r_statelock);
                        if (vsp != NULL)
                                nfs_acl_free(vsp);
                }
        }

        if (!was_serial) {
                mutex_enter(&rp->r_statelock);
                rp->r_serial = NULL;
                cv_broadcast(&rp->r_cv);
                mutex_exit(&rp->r_statelock);
        }
}

/*
 * Use the passed in "before" virtual attributes to check to see
 * whether the data and metadata caches are valid, cache the "after"
 * new attributes, and then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with both pre operation attributes and post operation
 * attributes.
 */
static void
nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
    cred_t *cr)
{
        rnode_t *rp;
        int mtime_changed;
        int ctime_changed;
        vsecattr_t *vsp;
        int was_serial;

        rp = VTOR(vp);

        mutex_enter(&rp->r_statelock);

        if (rp->r_serial != curthread) {
                klwp_t *lwp = ttolwp(curthread);

                was_serial = 0;
                if (lwp != NULL)
                        lwp->lwp_nostop++;
                while (rp->r_serial != NULL) {
                        if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                mutex_exit(&rp->r_statelock);
                                if (lwp != NULL)
                                        lwp->lwp_nostop--;
                                return;
                        }
                }
                if (lwp != NULL)
                        lwp->lwp_nostop--;
        } else
                was_serial = 1;

        if (rp->r_mtime > t) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        if (!(rp->r_flags & RWRITEATTR)) {
                if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
                        mtime_changed = 1;
                else
                        mtime_changed = 0;
                if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
                    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
                        ctime_changed = 1;
                else
                        ctime_changed = 0;
        } else {
                mtime_changed = 0;
                ctime_changed = 0;
        }

        nfs_attrcache_va(vp, avap);

        if (!mtime_changed && !ctime_changed) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        rp->r_serial = curthread;

        mutex_exit(&rp->r_statelock);

        if (mtime_changed)
                nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

        if (ctime_changed) {
                (void) nfs_access_purge_rp(rp);
                if (rp->r_secattr != NULL) {
                        mutex_enter(&rp->r_statelock);
                        vsp = rp->r_secattr;
                        rp->r_secattr = NULL;
                        mutex_exit(&rp->r_statelock);
                        if (vsp != NULL)
                                nfs_acl_free(vsp);
                }
        }

        if (!was_serial) {
                mutex_enter(&rp->r_statelock);
                rp->r_serial = NULL;
                cv_broadcast(&rp->r_cv);
                mutex_exit(&rp->r_statelock);
        }
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
void
nfs_attrcache_va(vnode_t *vp, struct vattr *va)
{
        rnode_t *rp;
        mntinfo_t *mi;
        hrtime_t delta;
        hrtime_t now;

        rp = VTOR(vp);

        ASSERT(MUTEX_HELD(&rp->r_statelock));

        now = gethrtime();

        mi = VTOMI(vp);

        /*
         * Delta is the number of nanoseconds that we will
         * cache the attributes of the file.  It is based on
         * the number of nanoseconds since the last time that
         * we detected a change.  The assumption is that files
         * that changed recently are likely to change again.
         * There is a minimum and a maximum for regular files
         * and for directories which is enforced though.
         *
         * Using the time since last change was detected
         * eliminates direct comparison or calculation
         * using mixed client and server times.  NFS does
         * not make any assumptions regarding the client
         * and server clocks being synchronized.
         */
        if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
            va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
            va->va_size != rp->r_attr.va_size)
                rp->r_mtime = now;

        if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
                delta = 0;
        else {
                delta = now - rp->r_mtime;
                if (vp->v_type == VDIR) {
                        if (delta < mi->mi_acdirmin)
                                delta = mi->mi_acdirmin;
                        else if (delta > mi->mi_acdirmax)
                                delta = mi->mi_acdirmax;
                } else {
                        if (delta < mi->mi_acregmin)
                                delta = mi->mi_acregmin;
                        else if (delta > mi->mi_acregmax)
                                delta = mi->mi_acregmax;
                }
        }
        rp->r_attrtime = now + delta;
        rp->r_attr = *va;
        /*
         * Update the size of the file if there is no cached data or if
         * the cached data is clean and there is no data being written
         * out.
         */
        if (rp->r_size != va->va_size &&
            (!vn_has_cached_data(vp) ||
            (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
                rp->r_size = va->va_size;
        nfs_setswaplike(vp, va);
        rp->r_flags &= ~RWRITEATTR;
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
{
        rnode_t *rp;

        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        if (ATTRCACHE_VALID(vp)) {
                /*
                 * Cached attributes are valid
                 */
                *vap = rp->r_attr;
                mutex_exit(&rp->r_statelock);
                return (0);
        }
        mutex_exit(&rp->r_statelock);
        return (1);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        struct nfsattrstat ns;
        int douprintf;
        mntinfo_t *mi;
        failinfo_t fi;
        hrtime_t t;

        mi = VTOMI(vp);
        fi.vp = vp;
        fi.fhp = NULL;          /* no need to update, filehandle not copied */
        fi.copyproc = nfscopyfh;
        fi.lookupproc = nfslookup;
        fi.xattrdirproc = acl_getxattrdir2;

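        /*
         * If the server supports the NFS_ACL protocol, fetch the
         * attributes via an ACL-protocol GETATTR.  Note that the MI_ACL
         * flag is tested again after the call: acl_getattr2_otw() clears
         * MI_ACL when the server turns out not to support the ACL
         * protocol, in which case we fall through to the plain
         * RFS_GETATTR below.
         */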
        if (mi->mi_flags & MI_ACL) {
                error = acl_getattr2_otw(vp, vap, cr);
                if (mi->mi_flags & MI_ACL)
                        return (error);
        }

        douprintf = 1;

        t = gethrtime();

        error = rfs2call(mi, RFS_GETATTR,
                        xdr_fhandle, (caddr_t)VTOFH(vp),
                        xdr_attrstat, (caddr_t)&ns, cr,
                        &douprintf, &ns.ns_status, 0, &fi);

        if (!error) {
                error = geterrno(ns.ns_status);
                if (!error)
                        error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
                else {
                        PURGE_STALE_FH(error, vp, cr);
                }
        }

        return (error);
}

/*
 * Return either cached or remote attributes.  If we get remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
int
nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        rnode_t *rp;

        /*
         * If we've got cached attributes, we're done, otherwise go
         * to the server to get attributes, which will update the cache
         * in the process.
         */
        error = nfs_getattr_cache(vp, vap);
        if (error)
                error = nfs_getattr_otw(vp, vap, cr);

        /* Return the client's view of file size */
        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        vap->va_size = rp->r_size;
        mutex_exit(&rp->r_statelock);

        return (error);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        GETATTR3args args;
        GETATTR3vres res;
        int douprintf;
        failinfo_t fi;
        hrtime_t t;

        args.object = *VTOFH3(vp);
        fi.vp = vp;
        fi.fhp = (caddr_t)&args.object;
        fi.copyproc = nfs3copyfh;
        fi.lookupproc = nfs3lookup;
        fi.xattrdirproc = acl_getxattrdir3;
        res.fres.vp = vp;
        res.fres.vap = vap;

        douprintf = 1;

        t = gethrtime();

        error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
            xdr_nfs_fh3, (caddr_t)&args,
            xdr_GETATTR3vres, (caddr_t)&res, cr,
            &douprintf, &res.status, 0, &fi);

        if (error)
                return (error);

        error = geterrno3(res.status);
        if (error) {
                PURGE_STALE_FH(error, vp, cr);
                return (error);
        }

        /*
         * Catch status codes that indicate fattr3 to vattr translation failure
         */
        if (res.fres.status)
                return (res.fres.status);

        nfs_attr_cache(vp, vap, t, cr);
        return (0);
}

/*
 * Return either cached or remote attributes.  If we get remote
 * attributes, use them to check and invalidate the caches, then cache
 * the new attributes.
 */
int
nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        rnode_t *rp;

        /*
         * If we've got cached attributes, we're done, otherwise go
         * to the server to get attributes, which will update the cache
         * in the process.
         */
        error = nfs_getattr_cache(vp, vap);
        if (error)
                error = nfs3_getattr_otw(vp, vap, cr);

        /* Return the client's view of file size */
        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        vap->va_size = rp->r_size;
        mutex_exit(&rp->r_statelock);

        return (error);
}

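/*
 * Table mapping NFS version 2 over-the-wire file types (NFNON through
 * NFSOC) to the corresponding vnode types.
 */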
vtype_t nf_to_vt[] = {
        VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
};
/*
 * Convert NFS Version 2 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 * Returns 0 for success, error if failed due to overflow.
 */
int
nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
{
        /* overflow in time attributes? */
#ifndef _LP64
        if (!NFS2_FATTR_TIME_OK(na))
                return (EOVERFLOW);
#endif

        if (na->na_type < NFNON || na->na_type > NFSOC)
                vap->va_type = VBAD;
        else
                vap->va_type = nf_to_vt[na->na_type];
        vap->va_mode = na->na_mode;
        vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
        vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
        vap->va_fsid = vp->v_vfsp->vfs_dev;
        vap->va_nodeid = na->na_nodeid;
        vap->va_nlink = na->na_nlink;
        vap->va_size = na->na_size;       /* keep for cache validation */
        /*
         * nfs protocol defines times as unsigned so don't extend sign,
         * unless sysadmin set nfs_allow_preepoch_time.
         */
        NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
        vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
        NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
        vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
        NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
        vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
        /*
         * Shannon's law - uncompress the received dev_t
         * if the top half of it is zero, indicating a response
         * from an `older style' OS. Except for when it is a
         * `new style' OS sending the maj device of zero,
         * in which case the algorithm still works because the
         * fact that it is a new style server
         * is hidden by the minor device not being greater
         * than 255 (a requirement in this case).
         */
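        /*
         * For example (a sketch; the old-style encoding is assumed here
         * to pack an 8-bit major and an 8-bit minor into the low 16
         * bits): a value of 0x0104 from an older server expands via
         * nfsv2_expdev() to major 1, minor 4, while a value with any of
         * the top 16 bits set is taken to be a new-style dev_t and is
         * expanded with expldev().
         */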
        if ((na->na_rdev & 0xffff0000) == 0)
                vap->va_rdev = nfsv2_expdev(na->na_rdev);
        else
                vap->va_rdev = expldev(na->na_rdev);

        vap->va_nblocks = na->na_blocks;
        switch (na->na_type) {
        case NFBLK:
                vap->va_blksize = DEV_BSIZE;
                break;

        case NFCHR:
                vap->va_blksize = MAXBSIZE;
                break;

        case NFSOC:
        default:
                vap->va_blksize = na->na_blocksize;
                break;
        }
        /*
         * This bit of ugliness is a hack to preserve the
         * over-the-wire protocols for named-pipe vnodes.
         * It remaps the special over-the-wire type to the
         * VFIFO type. (see note in nfs.h)
         */
        if (NA_ISFIFO(na)) {
                vap->va_type = VFIFO;
                vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
                vap->va_rdev = 0;
                vap->va_blksize = na->na_blocksize;
        }
        vap->va_seq = 0;
        return (0);
}

/*
 * Convert NFS Version 3 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 */
vtype_t nf3_to_vt[] = {
        VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
};

int
fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
{

#ifndef _LP64
        /* overflow in time attributes? */
        if (!NFS3_FATTR_TIME_OK(na))
                return (EOVERFLOW);
#endif
        if (!NFS3_SIZE_OK(na->size))
                /* file too big */
                return (EFBIG);

        vap->va_mask = AT_ALL;

        if (na->type < NF3REG || na->type > NF3FIFO)
                vap->va_type = VBAD;
        else
                vap->va_type = nf3_to_vt[na->type];
        vap->va_mode = na->mode;
        vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
        vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
        vap->va_fsid = vp->v_vfsp->vfs_dev;
        vap->va_nodeid = na->fileid;
        vap->va_nlink = na->nlink;
        vap->va_size = na->size;

        /*
         * nfs protocol defines times as unsigned so don't extend sign,
         * unless sysadmin set nfs_allow_preepoch_time.
         */
        NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
        vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
        NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
        vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
        NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
        vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

        switch (na->type) {
        case NF3BLK:
                vap->va_rdev = makedevice(na->rdev.specdata1,
                                        na->rdev.specdata2);
                vap->va_blksize = DEV_BSIZE;
                vap->va_nblocks = 0;
                break;
        case NF3CHR:
                vap->va_rdev = makedevice(na->rdev.specdata1,
                                        na->rdev.specdata2);
                vap->va_blksize = MAXBSIZE;
                vap->va_nblocks = 0;
                break;
        case NF3REG:
        case NF3DIR:
        case NF3LNK:
                vap->va_rdev = 0;
                vap->va_blksize = MAXBSIZE;
                vap->va_nblocks = (u_longlong_t)
                    ((na->used + (size3)DEV_BSIZE - (size3)1) /
                    (size3)DEV_BSIZE);
                break;
        case NF3SOCK:
        case NF3FIFO:
        default:
                vap->va_rdev = 0;
                vap->va_blksize = MAXBSIZE;
                vap->va_nblocks = 0;
                break;
        }
        vap->va_seq = 0;
        return (0);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs_async_putpage and nfs_async_start.
 */

int nfs_async_timeout = -1;     /* uninitialized */

static void     nfs_async_start(struct vfs *);

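/*
 * Release the references taken when an asynchronous request was queued:
 * drop the r_count (and, for writes, r_awcount) reference, release the
 * vnode hold, free the credentials, and free the request structure.
 * NFS_INACTIVE requests take no vnode hold or r_count reference, so
 * those steps are skipped in that case.
 */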
static void
free_async_args(struct nfs_async_reqs *args)
{
        rnode_t *rp;

        if (args->a_io != NFS_INACTIVE) {
                rp = VTOR(args->a_vp);
                mutex_enter(&rp->r_statelock);
                rp->r_count--;
                if (args->a_io == NFS_PUTAPAGE ||
                    args->a_io == NFS_PAGEIO)
                        rp->r_awcount--;
                cv_broadcast(&rp->r_cv);
                mutex_exit(&rp->r_statelock);
                VN_RELE(args->a_vp);
        }
        crfree(args->a_cred);
        kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs_async_manager(vfs_t *vfsp)
{
        callb_cpr_t cprinfo;
        mntinfo_t *mi;
        uint_t max_threads;

        mi = VFTOMI(vfsp);

        CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
                    "nfs_async_manager");

        mutex_enter(&mi->mi_async_lock);
        /*
         * We want to stash the max number of threads that this mount was
         * allowed so we can use it later when the variable is set to zero as
         * part of the zone/mount going away.
         *
         * We want to be able to create at least one thread to handle
         * asynchronous inactive calls.
         */
        max_threads = MAX(mi->mi_max_threads, 1);
        mutex_enter(&mi->mi_lock);
        /*
         * We don't want to wait for mi_max_threads to go to zero, since that
         * happens as part of a failed unmount, but this thread should only
         * exit when the mount/zone is really going away.
         *
         * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
         * attempted: the various _async_*() functions know to do things
         * inline if mi_max_threads == 0.  Henceforth we just drain out the
         * outstanding requests.
         *
         * Note that we still create zthreads even if we notice the zone is
         * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
         * shutdown sequence to take slightly longer in some cases, but
         * doesn't violate the protocol, as all threads will exit as soon as
         * they're done processing the remaining requests.
         */
        while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
            mi->mi_async_req_count > 0) {
                mutex_exit(&mi->mi_lock);
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
                CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
                while (mi->mi_async_req_count > 0) {
                        /*
                         * Paranoia: If the mount started out having
                         * (mi->mi_max_threads == 0), and the value was
                         * later changed (via a debugger or somesuch),
                         * we could be confused since we will think we
                         * can't create any threads, and the calling
                         * code (which looks at the current value of
                         * mi->mi_max_threads, now non-zero) thinks we
                         * can.
                         *
                         * So, because we're paranoid, we create threads
                         * up to the maximum of the original and the
                         * current value. This means that future
                         * (debugger-induced) lowerings of
                         * mi->mi_max_threads are ignored for our
                         * purposes, but who told them they could change
                         * random values on a live kernel anyhow?
                         */
                        if (mi->mi_threads <
                            MAX(mi->mi_max_threads, max_threads)) {
                                mi->mi_threads++;
                                mutex_exit(&mi->mi_async_lock);
                                VFS_HOLD(vfsp); /* hold for new thread */
                                (void) zthread_create(NULL, 0, nfs_async_start,
                                    vfsp, 0, minclsyspri);
                                mutex_enter(&mi->mi_async_lock);
                        }
                        cv_signal(&mi->mi_async_work_cv);
                        ASSERT(mi->mi_async_req_count != 0);
                        mi->mi_async_req_count--;
                }
                mutex_enter(&mi->mi_lock);
        }
        mutex_exit(&mi->mi_lock);
        /*
         * Let everyone know we're done.
         */
        mi->mi_manager_thread = NULL;
        cv_broadcast(&mi->mi_async_cv);

        /*
         * There is no explicit call to mutex_exit(&mi->mi_async_lock)
         * since CALLB_CPR_EXIT is actually responsible for releasing
         * 'mi_async_lock'.
         */
        CALLB_CPR_EXIT(&cprinfo);
        VFS_RELE(vfsp); /* release thread's hold */
        zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs_async_manager_stop(vfs_t *vfsp)
{
        mntinfo_t *mi = VFTOMI(vfsp);

        mutex_enter(&mi->mi_async_lock);
        mutex_enter(&mi->mi_lock);
        mi->mi_flags |= MI_ASYNC_MGR_STOP;
        mutex_exit(&mi->mi_lock);
        cv_broadcast(&mi->mi_async_reqs_cv);
        while (mi->mi_manager_thread != NULL)
                cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
        mutex_exit(&mi->mi_async_lock);
}

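/*
 * Queue an asynchronous readahead request.  Returns 0 if the request
 * was queued successfully, or -1 if it was not (in which case the
 * caller simply skips the readahead).
 */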
int
nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
        struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
        u_offset_t, caddr_t, struct seg *, cred_t *))
{
        rnode_t *rp;
        mntinfo_t *mi;
        struct nfs_async_reqs *args;

        rp = VTOR(vp);
        ASSERT(rp->r_freef == NULL);

        mi = VTOMI(vp);

        /*
         * If addr falls in a different segment, don't bother doing readahead.
         */
        if (addr >= seg->s_base + seg->s_size)
                return (-1);

        /*
         * If we can't allocate a request structure, punt on the readahead.
         */
        if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
                return (-1);

        /*
         * If a lock operation is pending, don't initiate any new
         * readaheads.  Otherwise, bump r_count to indicate the new
         * asynchronous I/O.
         */
        if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
                kmem_free(args, sizeof (*args));
                return (-1);
        }
        mutex_enter(&rp->r_statelock);
        rp->r_count++;
        mutex_exit(&rp->r_statelock);
        nfs_rw_exit(&rp->r_lkserlock);

        args->a_next = NULL;
#ifdef DEBUG
        args->a_queuer = curthread;
#endif
        VN_HOLD(vp);
        args->a_vp = vp;
        ASSERT(cr != NULL);
        crhold(cr);
        args->a_cred = cr;
        args->a_io = NFS_READ_AHEAD;
        args->a_nfs_readahead = readahead;
        args->a_nfs_blkoff = blkoff;
        args->a_nfs_seg = seg;
        args->a_nfs_addr = addr;

        mutex_enter(&mi->mi_async_lock);

        /*
         * If asyncio has been disabled, don't bother with the readahead.
         */
        if (mi->mi_max_threads == 0) {
                mutex_exit(&mi->mi_async_lock);
                goto noasync;
        }

        /*
         * Link request structure into the async list and
         * wakeup async thread to do the i/o.
         */
        if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
                mi->mi_async_reqs[NFS_READ_AHEAD] = args;
                mi->mi_async_tail[NFS_READ_AHEAD] = args;
        } else {
                mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
                mi->mi_async_tail[NFS_READ_AHEAD] = args;
        }

        if (mi->mi_io_kstats) {
                mutex_enter(&mi->mi_lock);
                kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
                mutex_exit(&mi->mi_lock);
        }

        mi->mi_async_req_count++;
        ASSERT(mi->mi_async_req_count != 0);
        cv_signal(&mi->mi_async_reqs_cv);
        mutex_exit(&mi->mi_async_lock);
        return (0);

noasync:
        mutex_enter(&rp->r_statelock);
        rp->r_count--;
        cv_broadcast(&rp->r_cv);
        mutex_exit(&rp->r_statelock);
        VN_RELE(vp);
        crfree(cr);
        kmem_free(args, sizeof (*args));
        return (-1);
}

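/*
 * Queue an asynchronous putpage request, falling back to a synchronous
 * (*putapage)() call in the caller's context if the request cannot be
 * queued.  pageout() and fsflush() are never forced to write
 * synchronously; their pages are re-marked dirty via pvn_write_done()
 * instead (see the comments at the noasync label below).
 */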
int
nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
        int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
        u_offset_t, size_t, int, cred_t *))
{
        rnode_t *rp;
        mntinfo_t *mi;
        struct nfs_async_reqs *args;

        ASSERT(flags & B_ASYNC);
        ASSERT(vp->v_vfsp != NULL);

        rp = VTOR(vp);
        ASSERT(rp->r_count > 0);

        mi = VTOMI(vp);

        /*
         * If we can't allocate a request structure, do the putpage
         * operation synchronously in this thread's context.
         */
        if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
                goto noasync;

        args->a_next = NULL;
#ifdef DEBUG
        args->a_queuer = curthread;
#endif
        VN_HOLD(vp);
        args->a_vp = vp;
        ASSERT(cr != NULL);
        crhold(cr);
        args->a_cred = cr;
        args->a_io = NFS_PUTAPAGE;
        args->a_nfs_putapage = putapage;
        args->a_nfs_pp = pp;
        args->a_nfs_off = off;
        args->a_nfs_len = (uint_t)len;
        args->a_nfs_flags = flags;

        mutex_enter(&mi->mi_async_lock);

        /*
         * If asyncio has been disabled, then make a synchronous request.
         * This check is done a second time in case async io was disabled
         * while this thread was blocked waiting for memory pressure to
         * reduce or for the queue to drain.
         */
        if (mi->mi_max_threads == 0) {
                mutex_exit(&mi->mi_async_lock);
                goto noasync;
        }

        /*
         * Link request structure into the async list and
         * wakeup async thread to do the i/o.
         */
        if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
                mi->mi_async_reqs[NFS_PUTAPAGE] = args;
                mi->mi_async_tail[NFS_PUTAPAGE] = args;
        } else {
                mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
                mi->mi_async_tail[NFS_PUTAPAGE] = args;
        }

        mutex_enter(&rp->r_statelock);
        rp->r_count++;
        rp->r_awcount++;
        mutex_exit(&rp->r_statelock);

        if (mi->mi_io_kstats) {
                mutex_enter(&mi->mi_lock);
                kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
                mutex_exit(&mi->mi_lock);
        }

        mi->mi_async_req_count++;
        ASSERT(mi->mi_async_req_count != 0);
        cv_signal(&mi->mi_async_reqs_cv);
        mutex_exit(&mi->mi_async_lock);
        return (0);

noasync:
        if (args != NULL) {
                VN_RELE(vp);
                crfree(cr);
                kmem_free(args, sizeof (*args));
        }

        if (curproc == proc_pageout || curproc == proc_fsflush) {
                /*
                 * If we get here in the context of the pageout/fsflush,
                 * we refuse to do a sync write, because this may hang
                 * pageout (and the machine). In this case, we just
                 * re-mark the page as dirty and punt on the page.
                 *
                 * Make sure B_FORCE isn't set.  We can re-mark the
                 * pages as dirty and unlock the pages in one swoop by
                 * passing in B_ERROR to pvn_write_done().  However,
                 * we should make sure B_FORCE isn't set - we don't
                 * want the page tossed before it gets written out.
                 */
                if (flags & B_FORCE)
                        flags &= ~(B_INVAL | B_FORCE);
                pvn_write_done(pp, flags | B_ERROR);
                return (0);
        }
        if (nfs_zone() != mi->mi_zone) {
                /*
                 * So this was a cross-zone sync putpage.  We pass in B_ERROR
                 * to pvn_write_done() to re-mark the pages as dirty and unlock
                 * them.
                 *
                 * We don't want to clear B_FORCE here as the caller presumably
                 * knows what they're doing if they set it.
                 */
                pvn_write_done(pp, flags | B_ERROR);
                return (EPERM);
        }
        return ((*putapage)(vp, pp, off, len, flags, cr));
}

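/*
 * Queue an asynchronous pageio request; like nfs_async_putapage() above,
 * but for reads the fallback is simply to clean up the page list with
 * pvn_read_done() rather than to do the I/O synchronously.
 */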
1502 int
1503 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1504         int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1505         size_t, int, cred_t *))
1506 {
1507         rnode_t *rp;
1508         mntinfo_t *mi;
1509         struct nfs_async_reqs *args;
1510 
1511         ASSERT(flags & B_ASYNC);
1512         ASSERT(vp->v_vfsp != NULL);
1513 
1514         rp = VTOR(vp);
1515         ASSERT(rp->r_count > 0);
1516 
1517         mi = VTOMI(vp);
1518 
1519         /*
1520          * If we can't allocate a request structure, do the pageio
1521          * request synchronously in this thread's context.
1522          */
1523         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1524                 goto noasync;
1525 
1526         args->a_next = NULL;
1527 #ifdef DEBUG
1528         args->a_queuer = curthread;
1529 #endif
1530         VN_HOLD(vp);
1531         args->a_vp = vp;
1532         ASSERT(cr != NULL);
1533         crhold(cr);
1534         args->a_cred = cr;
1535         args->a_io = NFS_PAGEIO;
1536         args->a_nfs_pageio = pageio;
1537         args->a_nfs_pp = pp;
1538         args->a_nfs_off = io_off;
1539         args->a_nfs_len = (uint_t)io_len;
1540         args->a_nfs_flags = flags;
1541 
1542         mutex_enter(&mi->mi_async_lock);
1543 
1544         /*
1545          * If asyncio has been disabled, then make a synchronous request.
1546          * This check is done a second time in case async io was diabled
1547          * while this thread was blocked waiting for memory pressure to
1548          * reduce or for the queue to drain.
1549          */
1550         if (mi->mi_max_threads == 0) {
1551                 mutex_exit(&mi->mi_async_lock);
1552                 goto noasync;
1553         }
1554 
1555         /*
1556          * Link request structure into the async list and
1557          * wakeup async thread to do the i/o.
1558          */
1559         if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1560                 mi->mi_async_reqs[NFS_PAGEIO] = args;
1561                 mi->mi_async_tail[NFS_PAGEIO] = args;
1562         } else {
1563                 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1564                 mi->mi_async_tail[NFS_PAGEIO] = args;
1565         }
1566 
1567         mutex_enter(&rp->r_statelock);
1568         rp->r_count++;
1569         rp->r_awcount++;
1570         mutex_exit(&rp->r_statelock);
1571 
1572         if (mi->mi_io_kstats) {
1573                 mutex_enter(&mi->mi_lock);
1574                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1575                 mutex_exit(&mi->mi_lock);
1576         }
1577 
1578         mi->mi_async_req_count++;
1579         ASSERT(mi->mi_async_req_count != 0);
1580         cv_signal(&mi->mi_async_reqs_cv);
1581         mutex_exit(&mi->mi_async_lock);
1582         return (0);
1583 
1584 noasync:
1585         if (args != NULL) {
1586                 VN_RELE(vp);
1587                 crfree(cr);
1588                 kmem_free(args, sizeof (*args));
1589         }
1590 
1591         /*
1592          * If we can't do it ASYNC, for reads we do nothing (but clean up
1593          * the page list), for writes we do it synchronously, except for
1594          * proc_pageout/proc_fsflush as described below.
1595          */
1596         if (flags & B_READ) {
1597                 pvn_read_done(pp, flags | B_ERROR);
1598                 return (0);
1599         }
1600 
1601         if (curproc == proc_pageout || curproc == proc_fsflush) {
1602                 /*
1603                  * If we get here in the context of the pageout/fsflush,
1604                  * we refuse to do a sync write, because this may hang
1605                  * pageout/fsflush (and the machine). In this case, we just
1606                  * re-mark the page as dirty and punt on the page.
1607                  *
1608                  * Make sure B_FORCE isn't set.  We can re-mark the
1609                  * pages as dirty and unlock the pages in one swoop by
1610                  * passing in B_ERROR to pvn_write_done().  However,
1611                  * we should make sure B_FORCE isn't set - we don't
1612                  * want the page tossed before it gets written out.
1613                  */
1614                 if (flags & B_FORCE)
1615                         flags &= ~(B_INVAL | B_FORCE);
1616                 pvn_write_done(pp, flags | B_ERROR);
1617                 return (0);
1618         }
1619 
1620         if (nfs_zone() != mi->mi_zone) {
1621                 /*
1622                  * So this was a cross-zone sync pageio.  We pass in B_ERROR
1623                  * to pvn_write_done() to re-mark the pages as dirty and unlock
1624                  * them.
1625                  *
1626                  * We don't want to clear B_FORCE here as the caller presumably
1627                  * knows what they're doing if they set it.
1628                  */
1629                 pvn_write_done(pp, flags | B_ERROR);
1630                 return (EPERM);
1631         }
1632         return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1633 }
1634 
1635 void
1636 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1637         int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1638 {
1639         rnode_t *rp;
1640         mntinfo_t *mi;
1641         struct nfs_async_reqs *args;
1642 
1643         rp = VTOR(vp);
1644         ASSERT(rp->r_freef == NULL);
1645 
1646         mi = VTOMI(vp);
1647 
1648         /*
1649          * If we can't allocate a request structure, do the readdir
1650          * operation synchronously in this thread's context.
1651          */
1652         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1653                 goto noasync;
1654 
1655         args->a_next = NULL;
1656 #ifdef DEBUG
1657         args->a_queuer = curthread;
1658 #endif
1659         VN_HOLD(vp);
1660         args->a_vp = vp;
1661         ASSERT(cr != NULL);
1662         crhold(cr);
1663         args->a_cred = cr;
1664         args->a_io = NFS_READDIR;
1665         args->a_nfs_readdir = readdir;
1666         args->a_nfs_rdc = rdc;
1667 
1668         mutex_enter(&mi->mi_async_lock);
1669 
1670         /*
1671          * If asyncio has been disabled, then make a synchronous request.
1672          */
1673         if (mi->mi_max_threads == 0) {
1674                 mutex_exit(&mi->mi_async_lock);
1675                 goto noasync;
1676         }
1677 
1678         /*
1679          * Link request structure into the async list and
1680          * wakeup async thread to do the i/o.
1681          */
1682         if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1683                 mi->mi_async_reqs[NFS_READDIR] = args;
1684                 mi->mi_async_tail[NFS_READDIR] = args;
1685         } else {
1686                 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1687                 mi->mi_async_tail[NFS_READDIR] = args;
1688         }
1689 
1690         mutex_enter(&rp->r_statelock);
1691         rp->r_count++;
1692         mutex_exit(&rp->r_statelock);
1693 
1694         if (mi->mi_io_kstats) {
1695                 mutex_enter(&mi->mi_lock);
1696                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1697                 mutex_exit(&mi->mi_lock);
1698         }
1699 
1700         mi->mi_async_req_count++;
1701         ASSERT(mi->mi_async_req_count != 0);
1702         cv_signal(&mi->mi_async_reqs_cv);
1703         mutex_exit(&mi->mi_async_lock);
1704         return;
1705 
1706 noasync:
1707         if (args != NULL) {
1708                 VN_RELE(vp);
1709                 crfree(cr);
1710                 kmem_free(args, sizeof (*args));
1711         }
1712 
1713         rdc->entries = NULL;
1714         mutex_enter(&rp->r_statelock);
1715         ASSERT(rdc->flags & RDDIR);
1716         rdc->flags &= ~RDDIR;
1717         rdc->flags |= RDDIRREQ;
1718         /*
1719          * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1720          * is set, wakeup the thread sleeping in cv_wait_sig().
1721          * The woken up thread will reset the flag to RDDIR and will
1722          * continue with the readdir operation.
1723          */
1724         if (rdc->flags & RDDIRWAIT) {
1725                 rdc->flags &= ~RDDIRWAIT;
1726                 cv_broadcast(&rdc->cv);
1727         }
1728         mutex_exit(&rp->r_statelock);
1729         rddir_cache_rele(rdc);
1730 }
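/*
 * rddir_cache flag handshake used in the noasync path above, roughly:
 *
 *	RDDIR		this cache entry is being filled in
 *	RDDIRREQ	the fill failed; a new request must be issued
 *	RDDIRWAIT	a thread is blocked in cv_wait_sig() on rdc->cv
 *
 * Clearing RDDIR and setting RDDIRREQ before the cv_broadcast() lets
 * the woken thread notice the failure and retry the readdir itself.
 */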
1731 
1732 void
1733 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1734         cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1735         cred_t *))
1736 {
1737         rnode_t *rp;
1738         mntinfo_t *mi;
1739         struct nfs_async_reqs *args;
1740         page_t *pp;
1741 
1742         rp = VTOR(vp);
1743         mi = VTOMI(vp);
1744 
1745         /*
1746          * If we can't allocate a request structure, do the commit
1747          * operation synchronously in this thread's context.
1748          */
1749         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1750                 goto noasync;
1751 
1752         args->a_next = NULL;
1753 #ifdef DEBUG
1754         args->a_queuer = curthread;
1755 #endif
1756         VN_HOLD(vp);
1757         args->a_vp = vp;
1758         ASSERT(cr != NULL);
1759         crhold(cr);
1760         args->a_cred = cr;
1761         args->a_io = NFS_COMMIT;
1762         args->a_nfs_commit = commit;
1763         args->a_nfs_plist = plist;
1764         args->a_nfs_offset = offset;
1765         args->a_nfs_count = count;
1766 
1767         mutex_enter(&mi->mi_async_lock);
1768 
1769         /*
1770          * If asyncio has been disabled, then make a synchronous request.
1771          * This check is done a second time in case async io was disabled
1772          * while this thread was blocked waiting for memory pressure to
1773          * reduce or for the queue to drain.
1774          */
1775         if (mi->mi_max_threads == 0) {
1776                 mutex_exit(&mi->mi_async_lock);
1777                 goto noasync;
1778         }
1779 
1780         /*
1781          * Link request structure into the async list and
1782          * wakeup async thread to do the i/o.
1783          */
1784         if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1785                 mi->mi_async_reqs[NFS_COMMIT] = args;
1786                 mi->mi_async_tail[NFS_COMMIT] = args;
1787         } else {
1788                 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1789                 mi->mi_async_tail[NFS_COMMIT] = args;
1790         }
1791 
1792         mutex_enter(&rp->r_statelock);
1793         rp->r_count++;
1794         mutex_exit(&rp->r_statelock);
1795 
1796         if (mi->mi_io_kstats) {
1797                 mutex_enter(&mi->mi_lock);
1798                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1799                 mutex_exit(&mi->mi_lock);
1800         }
1801 
1802         mi->mi_async_req_count++;
1803         ASSERT(mi->mi_async_req_count != 0);
1804         cv_signal(&mi->mi_async_reqs_cv);
1805         mutex_exit(&mi->mi_async_lock);
1806         return;
1807 
1808 noasync:
1809         if (args != NULL) {
1810                 VN_RELE(vp);
1811                 crfree(cr);
1812                 kmem_free(args, sizeof (*args));
1813         }
1814 
1815         if (curproc == proc_pageout || curproc == proc_fsflush ||
1816             nfs_zone() != mi->mi_zone) {
1817                 while (plist != NULL) {
1818                         pp = plist;
1819                         page_sub(&plist, pp);
1820                         pp->p_fsdata = C_COMMIT;
1821                         page_unlock(pp);
1822                 }
1823                 return;
1824         }
1825         (*commit)(vp, plist, offset, count, cr);
1826 }
1827 
1828 void
1829 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1830     void (*inactive)(vnode_t *, cred_t *))
1831 {
1832         mntinfo_t *mi;
1833         struct nfs_async_reqs *args;
1834 
1835         mi = VTOMI(vp);
1836 
1837         args = kmem_alloc(sizeof (*args), KM_SLEEP);
1838         args->a_next = NULL;
1839 #ifdef DEBUG
1840         args->a_queuer = curthread;
1841 #endif
1842         args->a_vp = vp;
1843         ASSERT(cr != NULL);
1844         crhold(cr);
1845         args->a_cred = cr;
1846         args->a_io = NFS_INACTIVE;
1847         args->a_nfs_inactive = inactive;
1848 
1849         /*
1850          * Note that we don't check mi->mi_max_threads here, since we
1851          * *need* to get rid of this vnode regardless of whether someone
1852          * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1853          *
1854          * The manager thread knows about this and is willing to create
1855          * at least one thread to accommodate us.
1856          */
1857         mutex_enter(&mi->mi_async_lock);
1858         if (mi->mi_manager_thread == NULL) {
1859                 rnode_t *rp = VTOR(vp);
1860 
1861                 mutex_exit(&mi->mi_async_lock);
1862                 crfree(cr);     /* drop our reference */
1863                 kmem_free(args, sizeof (*args));
1864                 /*
1865                  * We can't do an over-the-wire call since we're in the wrong
1866                  * zone, so we need to clean up state as best we can and then
1867                  * throw away the vnode.
1868                  */
1869                 mutex_enter(&rp->r_statelock);
1870                 if (rp->r_unldvp != NULL) {
1871                         vnode_t *unldvp;
1872                         char *unlname;
1873                         cred_t *unlcred;
1874 
1875                         unldvp = rp->r_unldvp;
1876                         rp->r_unldvp = NULL;
1877                         unlname = rp->r_unlname;
1878                         rp->r_unlname = NULL;
1879                         unlcred = rp->r_unlcred;
1880                         rp->r_unlcred = NULL;
1881                         mutex_exit(&rp->r_statelock);
1882 
1883                         VN_RELE(unldvp);
1884                         kmem_free(unlname, MAXNAMELEN);
1885                         crfree(unlcred);
1886                 } else {
1887                         mutex_exit(&rp->r_statelock);
1888                 }
1889                 /*
1890                  * No need to explicitly throw away any cached pages.  The
1891                  * eventual rinactive() will attempt a synchronous
1892                  * VOP_PUTPAGE() which will immediately fail since the request
1893                  * is coming from the wrong zone, and then will proceed to call
1894                  * nfs_invalidate_pages() which will clean things up for us.
1895                  */
1896                 rp_addfree(VTOR(vp), cr);
1897                 return;
1898         }
1899 
1900         if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1901                 mi->mi_async_reqs[NFS_INACTIVE] = args;
1902         } else {
1903                 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1904         }
1905         mi->mi_async_tail[NFS_INACTIVE] = args;
1906         /*
1907          * Don't increment r_count, since we're trying to get rid of the vnode.
1908          */
1909 
1910         mi->mi_async_req_count++;
1911         ASSERT(mi->mi_async_req_count != 0);
1912         cv_signal(&mi->mi_async_reqs_cv);
1913         mutex_exit(&mi->mi_async_lock);
1914 }
1915 
1916 /*
1917  * The async queues for each mounted file system are arranged as a
1918  * set of queues, one for each async i/o type.  Requests are taken
1919  * from the queues in a round-robin fashion.  A number of consecutive
1920  * requests are taken from each queue before moving on to the next
1921  * queue.  This functionality may allow the NFS Version 2 server to do
1922  * write clustering, even if the client is mixing writes and reads
1923  * because it will take multiple write requests from the queue
1924  * before processing any of the other async i/o types.
1925  *
1926  * XXX The nfs_async_start thread is unsafe in the light of the present
1927  * model defined by cpr to suspend the system. Specifically over the
1928  * wire calls are cpr-unsafe. The thread should be reevaluated in
1929  * case of future updates to the cpr model.
1930  */
1931 static void
1932 nfs_async_start(struct vfs *vfsp)
1933 {
1934         struct nfs_async_reqs *args;
1935         mntinfo_t *mi = VFTOMI(vfsp);
1936         clock_t time_left = 1;
1937         callb_cpr_t cprinfo;
1938         int i;
1939 
1940         /*
1941          * Dynamic initialization of nfs_async_timeout to allow nfs to be
1942          * built in an implementation-independent manner.
1943          */
1944         if (nfs_async_timeout == -1)
1945                 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1946 
1947         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1948 
1949         mutex_enter(&mi->mi_async_lock);
1950         for (;;) {
1951                 /*
1952                  * Find the next queue containing an entry.  We start
1953                  * at the current queue pointer and then round robin
1954                  * through all of them until we either find a non-empty
1955                  * queue or have looked through all of them.
1956                  */
1957                 for (i = 0; i < NFS_ASYNC_TYPES; i++) {
1958                         args = *mi->mi_async_curr;
1959                         if (args != NULL)
1960                                 break;
1961                         mi->mi_async_curr++;
1962                         if (mi->mi_async_curr ==
1963                             &mi->mi_async_reqs[NFS_ASYNC_TYPES])
1964                                 mi->mi_async_curr = &mi->mi_async_reqs[0];
1965                 }
1966                 /*
1967                  * If we didn't find an entry, then block until woken up
1968                  * again and then look through the queues again.
1969                  */
1970                 if (args == NULL) {
1971                         /*
1972                          * Exiting is considered to be safe for CPR as well.
1973                          */
1974                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1975 
1976                         /*
1977                          * Wakeup thread waiting to unmount the file
1978                          * system only if all async threads are inactive.
1979                          *
1980                          * If we've timed-out and there's nothing to do,
1981                          * then get rid of this thread.
1982                          */
1983                         if (mi->mi_max_threads == 0 || time_left <= 0) {
1984                                 if (--mi->mi_threads == 0)
1985                                         cv_signal(&mi->mi_async_cv);
1986                                 CALLB_CPR_EXIT(&cprinfo);
1987                                 VFS_RELE(vfsp); /* release thread's hold */
1988                                 zthread_exit();
1989                                 /* NOTREACHED */
1990                         }
1991                         time_left = cv_timedwait(&mi->mi_async_work_cv,
1992                             &mi->mi_async_lock, nfs_async_timeout + lbolt);
1993 
1994                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1995 
1996                         continue;
1997                 }
1998                 time_left = 1;
1999 
2000                 /*
2001                  * Remove the request from the async queue and then
2002                  * update the current async request queue pointer.  If
2003                  * the current queue is empty or we have removed enough
2004                  * consecutive entries from it, then reset the counter
2005                  * for this queue and then move the current pointer to
2006                  * the next queue.
2007                  */
2008                 *mi->mi_async_curr = args->a_next;
2009                 if (*mi->mi_async_curr == NULL ||
2010                     --mi->mi_async_clusters[args->a_io] == 0) {
2011                         mi->mi_async_clusters[args->a_io] =
2012                                                 mi->mi_async_init_clusters;
2013                         mi->mi_async_curr++;
2014                         if (mi->mi_async_curr ==
2015                             &mi->mi_async_reqs[NFS_ASYNC_TYPES])
2016                                 mi->mi_async_curr = &mi->mi_async_reqs[0];
2017                 }
2018 
2019                 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2020                         mutex_enter(&mi->mi_lock);
2021                         kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2022                         mutex_exit(&mi->mi_lock);
2023                 }
2024 
2025                 mutex_exit(&mi->mi_async_lock);
2026 
2027                 /*
2028                  * Dispatch the operation using the request arguments.
2029                  */
2030                 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2031                         (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2032                                         args->a_nfs_addr, args->a_nfs_seg,
2033                                         args->a_cred);
2034                 } else if (args->a_io == NFS_PUTAPAGE) {
2035                         (void) (*args->a_nfs_putapage)(args->a_vp,
2036                                         args->a_nfs_pp, args->a_nfs_off,
2037                                         args->a_nfs_len, args->a_nfs_flags,
2038                                         args->a_cred);
2039                 } else if (args->a_io == NFS_PAGEIO) {
2040                         (void) (*args->a_nfs_pageio)(args->a_vp,
2041                                         args->a_nfs_pp, args->a_nfs_off,
2042                                         args->a_nfs_len, args->a_nfs_flags,
2043                                         args->a_cred);
2044                 } else if (args->a_io == NFS_READDIR) {
2045                         (void) ((*args->a_nfs_readdir)(args->a_vp,
2046                                         args->a_nfs_rdc, args->a_cred));
2047                 } else if (args->a_io == NFS_COMMIT) {
2048                         (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2049                                         args->a_nfs_offset, args->a_nfs_count,
2050                                         args->a_cred);
2051                 } else if (args->a_io == NFS_INACTIVE) {
2052                         (*args->a_nfs_inactive)(args->a_vp, args->a_cred);
2053                 }
2054 
2055                 /*
2056                  * Now, release the vnode and free the credentials
2057                  * structure.
2058                  */
2059                 free_async_args(args);
2060                 /*
2061                  * Reacquire the mutex; it is needed at the top of the loop.
2062                  */
2063                 mutex_enter(&mi->mi_async_lock);
2064         }
2065 }
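/*
 * A worked example of the round-robin clustering above: assume
 * mi_async_init_clusters is 4 and only the PUTAPAGE and READ_AHEAD
 * queues are non-empty.  A worker thread takes up to 4 consecutive
 * putpage requests, advances mi_async_curr to the readahead queue,
 * takes up to 4 of those, and then wraps around.  An exhausted queue
 * also advances mi_async_curr immediately, so an empty queue costs
 * only a pointer increment per scan.
 */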
2066 
2067 void
2068 nfs_async_stop(struct vfs *vfsp)
2069 {
2070         mntinfo_t *mi = VFTOMI(vfsp);
2071 
2072         /*
2073          * Wait for all outstanding async operations to complete and for the
2074          * worker threads to exit.
2075          */
2076         mutex_enter(&mi->mi_async_lock);
2077         mi->mi_max_threads = 0;
2078         cv_broadcast(&mi->mi_async_work_cv);
2079         while (mi->mi_threads != 0)
2080                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2081         mutex_exit(&mi->mi_async_lock);
2082 }
2083 
2084 /*
2085  * nfs_async_stop_sig:
2086  * Wait for all outstanding putpage operations to complete. If a signal
2087  * is delivered we will abort and return non-zero. If we can put all the
2088  * pages we will return 0. This routine is called from nfs_unmount and
2089  * nfs3_unmount to make these operations interruptible.
2090  */
2091 int
2092 nfs_async_stop_sig(struct vfs *vfsp)
2093 {
2094         mntinfo_t *mi = VFTOMI(vfsp);
2095         ushort_t omax;
2096         int rval;
2097 
2098         /*
2099          * Wait for all outstanding async operations to complete and for the
2100          * worker threads to exit.
2101          */
2102         mutex_enter(&mi->mi_async_lock);
2103         omax = mi->mi_max_threads;
2104         mi->mi_max_threads = 0;
2105         /*
2106          * Tell all the worker threads to exit.
2107          */
2108         cv_broadcast(&mi->mi_async_work_cv);
2109         while (mi->mi_threads != 0) {
2110                 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2111                         break;
2112         }
2113         rval = (mi->mi_threads != 0);        /* Interrupted */
2114         if (rval)
2115                 mi->mi_max_threads = omax;
2116         mutex_exit(&mi->mi_async_lock);
2117 
2118         return (rval);
2119 }
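/*
 * A sketch of the intended caller (per the comment above, the unmount
 * paths):
 *
 *	if (nfs_async_stop_sig(vfsp))
 *		return (EINTR);		(interrupted; mount stays usable)
 *
 * On interrupt, mi_max_threads is restored to its old value so the
 * file system can keep spawning async threads if the unmount is
 * abandoned.
 */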
2120 
2121 int
2122 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2123 {
2124         int pagecreate;
2125         int n;
2126         int saved_n;
2127         caddr_t saved_base;
2128         u_offset_t offset;
2129         int error;
2130         int sm_error;
2131 
2132         ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2133         ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2134         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2135 
2136         /*
2137          * Move bytes in at most PAGESIZE chunks. We must avoid
2138          * spanning pages in uiomove() because page faults may cause
2139          * the cache to be invalidated out from under us. The r_size is not
2140          * updated until after the uiomove. If we push the last page of a
2141          * file before r_size is correct, we will lose the data written past
2142          * the current (and invalid) r_size.
2143          */
2144         do {
2145                 offset = uio->uio_loffset;
2146                 pagecreate = 0;
2147 
2148                 /*
2149                  * n is the number of bytes required to satisfy the request
2150                  *   or the number of bytes to fill out the page.
2151                  */
2152                 n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)),
2153                     tcount);
2154 
2155                 /*
2156                  * Check to see if we can skip reading in the page
2157                  * and just allocate the memory.  We can do this
2158                  * if we are going to rewrite the entire mapping
2159                  * or if we are going to write to or beyond the current
2160                  * end of file from the beginning of the mapping.
2161                  *
2162                  * The read of r_size is now protected by r_statelock.
2163                  */
2164                 mutex_enter(&rp->r_statelock);
2165                 /*
2166                  * When pgcreated is nonzero the caller has already done
2167                  * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2168                  * segkpm this means we already have at least one page
2169                  * created and mapped at base.
2170                  */
2171                 pagecreate = pgcreated ||
2172                         (((uintptr_t)base & PAGEOFFSET) == 0 &&
2173                         (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2174 
2175                 mutex_exit(&rp->r_statelock);
2176                 if (pagecreate) {
2177                         /*
2178                          * The last argument tells segmap_pagecreate() to
2179                          * always lock the page, as opposed to sometimes
2180                          * returning with the page locked. This way we avoid a
2181                          * fault on the ensuing uiomove(), but also
2182                          * more importantly (to fix bug 1094402) we can
2183                          * call segmap_fault() to unlock the page in all
2184                          * cases. An alternative would be to modify
2185                          * segmap_pagecreate() to tell us when it is
2186                          * locking a page, but that's a fairly major
2187                          * interface change.
2188                          */
2189                         if (pgcreated == 0)
2190                                 (void) segmap_pagecreate(segkmap, base,
2191                                                         (uint_t)n, 1);
2192                         saved_base = base;
2193                         saved_n = n;
2194                 }
2195 
2196                 /*
2197                  * The number of bytes of data in the last page cannot
2198                  * be accurately determined while the page is being
2199                  * uiomove'd to and the size of the file is being updated.
2200                  * Thus, inform threads which need to know accurately
2201                  * how much data is in the last page of the file.  They
2202                  * will not do the i/o immediately, but will arrange for
2203                  * the i/o to happen later when this modify operation
2204                  * has finished.
2205                  */
2206                 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2207                 mutex_enter(&rp->r_statelock);
2208                 rp->r_flags |= RMODINPROGRESS;
2209                 rp->r_modaddr = (offset & MAXBMASK);
2210                 mutex_exit(&rp->r_statelock);
2211 
2212                 error = uiomove(base, n, UIO_WRITE, uio);
2213 
2214                 /*
2215                  * r_size is the maximum number of
2216                  * bytes known to be in the file.
2217                  * Make sure it is at least as high as the
2218                  * first unwritten byte pointed to by uio_loffset.
2219                  */
2220                 mutex_enter(&rp->r_statelock);
2221                 if (rp->r_size < uio->uio_loffset)
2222                         rp->r_size = uio->uio_loffset;
2223                 rp->r_flags &= ~RMODINPROGRESS;
2224                 rp->r_flags |= RDIRTY;
2225                 mutex_exit(&rp->r_statelock);
2226 
2227                 /* n = # of bytes written */
2228                 n = (int)(uio->uio_loffset - offset);
2229                 base += n;
2230                 tcount -= n;
2231                 /*
2232                  * If we created pages w/o initializing them completely,
2233                  * we need to zero the part that wasn't set up.
2234                  * This happens in most EOF write cases and if
2235                  * we had some sort of error during the uiomove.
2236                  */
2237                 if (pagecreate) {
2238                         if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2239                                 (void) kzero(base, PAGESIZE - n);
2240 
2241                         if (pgcreated) {
2242                                 /*
2243                                  * Caller is responsible for this page,
2244                                  * it was not created in this loop.
2245                                  */
2246                                 pgcreated = 0;
2247                         } else {
2248                                 /*
2249                                  * For bug 1094402: segmap_pagecreate locks
2250                                  * page. Unlock it. This also unlocks the
2251                                  * pages allocated by page_create_va() in
2252                                  * segmap_pagecreate().
2253                                  */
2254                                 sm_error = segmap_fault(kas.a_hat, segkmap,
2255                                                 saved_base, saved_n,
2256                                                 F_SOFTUNLOCK, S_WRITE);
2257                                 if (error == 0)
2258                                         error = sm_error;
2259                         }
2260                 }
2261         } while (tcount > 0 && error == 0);
2262 
2263         return (error);
2264 }
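/*
 * For example (assuming 4K pages): a 6000-byte write whose base is at
 * page offset 100 is moved in two uiomove() calls, 3996 bytes to fill
 * out the first page and then 2004 bytes into the next, so no single
 * uiomove() ever spans a page boundary.
 */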
2265 
2266 int
2267 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2268 {
2269         rnode_t *rp;
2270         page_t *pp;
2271         u_offset_t eoff;
2272         u_offset_t io_off;
2273         size_t io_len;
2274         int error;
2275         int rdirty;
2276         int err;
2277 
2278         rp = VTOR(vp);
2279         ASSERT(rp->r_count > 0);
2280 
2281         if (!vn_has_cached_data(vp))
2282                 return (0);
2283 
2284         ASSERT(vp->v_type != VCHR);
2285 
2286         /*
2287          * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2288          * writes.  B_FORCE is set to force the VM system to actually
2289          * invalidate the pages, even if the i/o failed.  The pages
2290          * need to get invalidated because they can't be written out
2291          * because there isn't any space left on either the server's
2292          * file system or in the user's disk quota.  The B_FREE bit
2293          * is cleared to avoid confusion as to whether this is a
2294          * request to place the page on the freelist or to destroy
2295          * it.
2296          */
2297         if ((rp->r_flags & ROUTOFSPACE) ||
2298             (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2299                 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2300 
2301         if (len == 0) {
2302                 /*
2303                  * If doing a full file synchronous operation, then clear
2304                  * the RDIRTY bit.  If a page gets dirtied while the flush
2305                  * is happening, then RDIRTY will get set again.  The
2306                  * RDIRTY bit must get cleared before the flush so that
2307                  * we don't lose this information.
2308                  */
2309                 if (off == (u_offset_t)0 &&
2310                     !(flags & B_ASYNC) &&
2311                     (rp->r_flags & RDIRTY)) {
2312                         mutex_enter(&rp->r_statelock);
2313                         rdirty = (rp->r_flags & RDIRTY);
2314                         rp->r_flags &= ~RDIRTY;
2315                         mutex_exit(&rp->r_statelock);
2316                 } else
2317                         rdirty = 0;
2318 
2319                 /*
2320                  * Search the entire vp list for pages >= off, and flush
2321                  * the dirty pages.
2322                  */
2323                 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2324                                         flags, cr);
2325 
2326                 /*
2327                  * If an error occurred and the file was marked as dirty
2328                  * before and we aren't forcibly invalidating pages, then
2329                  * reset the RDIRTY flag.
2330                  */
2331                 if (error && rdirty &&
2332                     (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2333                         mutex_enter(&rp->r_statelock);
2334                         rp->r_flags |= RDIRTY;
2335                         mutex_exit(&rp->r_statelock);
2336                 }
2337         } else {
2338                 /*
2339                  * Do a range from [off...off + len) looking for pages
2340                  * to deal with.
2341                  */
2342                 error = 0;
2343 #ifdef lint
2344                 io_len = 0;
2345 #endif
2346                 eoff = off + len;
2347                 mutex_enter(&rp->r_statelock);
2348                 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2349                     io_off += io_len) {
2350                         mutex_exit(&rp->r_statelock);
2351                         /*
2352                          * Unless we are invalidating or synchronously
2353                          * freeing or writing pages, use the routine
2354                          * page_lookup_nowait() to prevent reclaiming
2355                          * them from the free list.
2356                          */
2357                         if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2358                                 pp = page_lookup(vp, io_off,
2359                                     (flags & (B_INVAL | B_FREE)) ?
2360                                     SE_EXCL : SE_SHARED);
2361                         } else {
2362                                 pp = page_lookup_nowait(vp, io_off,
2363                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2364                         }
2365 
2366                         if (pp == NULL || !pvn_getdirty(pp, flags))
2367                                 io_len = PAGESIZE;
2368                         else {
2369                                 err = (*rp->r_putapage)(vp, pp, &io_off,
2370                                     &io_len, flags, cr);
2371                                 if (!error)
2372                                         error = err;
2373                                 /*
2374                                  * "io_off" and "io_len" are returned as
2375                                  * the range of pages we actually wrote.
2376                                  * This allows us to skip ahead more quickly
2377                                  * since several pages may've been dealt
2378                                  * with by this iteration of the loop.
2379                                  */
2380                         }
2381                         mutex_enter(&rp->r_statelock);
2382                 }
2383                 mutex_exit(&rp->r_statelock);
2384         }
2385 
2386         return (error);
2387 }
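/*
 * Note that io_len lets the range loop above skip ahead in large
 * steps: if (*rp->r_putapage)() clusters a 56K dirty run into one
 * write, io_len comes back as 57344 and the next iteration starts 14
 * pages later (assuming 4K pages).
 */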
2388 
2389 void
2390 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2391 {
2392         rnode_t *rp;
2393 
2394         rp = VTOR(vp);
2395         mutex_enter(&rp->r_statelock);
2396         while (rp->r_flags & RTRUNCATE)
2397                 cv_wait(&rp->r_cv, &rp->r_statelock);
2398         rp->r_flags |= RTRUNCATE;
2399         if (off == (u_offset_t)0) {
2400                 rp->r_flags &= ~RDIRTY;
2401                 if (!(rp->r_flags & RSTALE))
2402                         rp->r_error = 0;
2403         }
2404         rp->r_truncaddr = off;
2405         mutex_exit(&rp->r_statelock);
2406         (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2407                 B_INVAL | B_TRUNC, cr);
2408         mutex_enter(&rp->r_statelock);
2409         rp->r_flags &= ~RTRUNCATE;
2410         cv_broadcast(&rp->r_cv);
2411         mutex_exit(&rp->r_statelock);
2412 }
2413 
2414 static int nfs_write_error_to_cons_only = 0;
2415 #define MSG(x)  (nfs_write_error_to_cons_only ? (x) : (x) + 1)
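/*
 * The format strings passed to MSG() begin with '^', the cmn_err()
 * prefix that directs a message to the console only.  When
 * nfs_write_error_to_cons_only is zero, MSG() skips past the '^' so
 * the message is also recorded in the system log.
 */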
2416 
2417 /*
2418  * Print a file handle
2419  */
2420 void
2421 nfs_printfhandle(nfs_fhandle *fhp)
2422 {
2423         int *ip;
2424         char *buf;
2425         size_t bufsize;
2426         char *cp;
2427 
2428         /*
2429          * 13 == "(file handle:"
2430          * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2431          *      1 == ' '
2432          *      8 == maximum strlen of "%x"
2433          * 3 == ")\n\0"
2434          */
2435         bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2436         buf = kmem_alloc(bufsize, KM_NOSLEEP);
2437         if (buf == NULL)
2438                 return;
2439 
2440         cp = buf;
2441         (void) strcpy(cp, "(file handle:");
2442         while (*cp != '\0')
2443                 cp++;
2444         for (ip = (int *)fhp->fh_buf;
2445             ip < (int *)&fhp->fh_buf[fhp->fh_len];
2446             ip++) {
2447                 (void) sprintf(cp, " %x", *ip);
2448                 while (*cp != '\0')
2449                         cp++;
2450         }
2451         (void) strcpy(cp, ")\n");
2452 
2453         zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2454 
2455         kmem_free(buf, bufsize);
2456 }
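/*
 * The resulting console message looks something like this (a made-up
 * 24-byte handle shown as six %x words):
 *
 *	(file handle: 1540002 2 a0000 4a9 48df4455 a0000)
 */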
2457 
2458 /*
2459  * Notify the system administrator that an NFS write error has
2460  * occurred.
2461  */
2462 
2463 /* seconds between ENOSPC/EDQUOT messages */
2464 clock_t nfs_write_error_interval = 5;
2465 
2466 void
2467 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2468 {
2469         mntinfo_t *mi;
2470 
2471         mi = VTOMI(vp);
2472         /*
2473          * In case of forced unmount or zone shutdown, do not print any
2474          * messages since it can flood the console with error messages.
2475          */
2476         if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2477                 return;
2478 
2479         /*
2480          * No use in flooding the console with ENOSPC
2481          * messages from the same file system.
2482          */
2483         if ((error != ENOSPC && error != EDQUOT) ||
2484             lbolt - mi->mi_printftime > 0) {
2485                 zoneid_t zoneid = mi->mi_zone->zone_id;
2486 
2487 #ifdef DEBUG
2488                 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2489                     mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2490 #else
2491                 nfs_perror(error, "NFS write error on host %s: %m.\n",
2492                     VTOR(vp)->r_server->sv_hostname, NULL);
2493 #endif
2494                 if (error == ENOSPC || error == EDQUOT) {
2495                         zcmn_err(zoneid, CE_CONT,
2496                             MSG("^File: userid=%d, groupid=%d\n"),
2497                             crgetuid(cr), crgetgid(cr));
2498                         if (crgetuid(CRED()) != crgetuid(cr) ||
2499                             crgetgid(CRED()) != crgetgid(cr)) {
2500                                 zcmn_err(zoneid, CE_CONT,
2501                                     MSG("^User: userid=%d, groupid=%d\n"),
2502                                     crgetuid(CRED()), crgetgid(CRED()));
2503                         }
2504                         mi->mi_printftime = lbolt +
2505                             nfs_write_error_interval * hz;
2506                 }
2507                 nfs_printfhandle(&VTOR(vp)->r_fh);
2508 #ifdef DEBUG
2509                 if (error == EACCES) {
2510                         zcmn_err(zoneid, CE_CONT,
2511                             MSG("^nfs_bio: cred is%s kcred\n"),
2512                             cr == kcred ? "" : " not");
2513                 }
2514 #endif
2515         }
2516 }
2517 
2518 /* ARGSUSED */
2519 static void *
2520 nfs_mi_init(zoneid_t zoneid)
2521 {
2522         struct mi_globals *mig;
2523 
2524         mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2525         mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2526         list_create(&mig->mig_list, sizeof (mntinfo_t),
2527             offsetof(mntinfo_t, mi_zone_node));
2528         mig->mig_destructor_called = B_FALSE;
2529         return (mig);
2530 }
2531 
2532 /*
2533  * Callback routine to tell all NFS mounts in the zone to stop creating new
2534  * threads.  Existing threads should exit.
2535  */
2536 /* ARGSUSED */
2537 static void
2538 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2539 {
2540         struct mi_globals *mig = data;
2541         mntinfo_t *mi;
2542 
2543         ASSERT(mig != NULL);
2544 again:
2545         mutex_enter(&mig->mig_lock);
2546         for (mi = list_head(&mig->mig_list); mi != NULL;
2547             mi = list_next(&mig->mig_list, mi)) {
2548 
2549                 /*
2550                  * If we've done the shutdown work for this FS, skip.
2551                  * Once we go off the end of the list, we're done.
2552                  */
2553                 if (mi->mi_flags & MI_DEAD)
2554                         continue;
2555 
2556                 /*
2557                  * We will do work, so not done.  Get a hold on the FS.
2558                  */
2559                 VFS_HOLD(mi->mi_vfsp);
2560 
2561                 /*
2562                  * purge the DNLC for this filesystem
2563                  */
2564                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2565 
2566                 mutex_enter(&mi->mi_async_lock);
2567                 /*
2568                  * Tell existing async worker threads to exit.
2569                  */
2570                 mi->mi_max_threads = 0;
2571                 cv_broadcast(&mi->mi_async_work_cv);
2572                 /*
2573                  * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2574                  * getting ready to exit when it's done with its current work.
2575                  * Also set MI_DEAD to note we've acted on this FS.
2576                  */
2577                 mutex_enter(&mi->mi_lock);
2578                 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2579                 mutex_exit(&mi->mi_lock);
2580                 /*
2581                  * Wake up the async manager thread.
2582                  */
2583                 cv_broadcast(&mi->mi_async_reqs_cv);
2584                 mutex_exit(&mi->mi_async_lock);
2585 
2586                 /*
2587                  * Drop lock and release FS, which may change list, then repeat.
2588                  * We're done when every mi has been done or the list is empty.
2589                  */
2590                 mutex_exit(&mig->mig_lock);
2591                 VFS_RELE(mi->mi_vfsp);
2592                 goto again;
2593         }
2594         mutex_exit(&mig->mig_lock);
2595 }
2596 
2597 static void
2598 nfs_mi_free_globals(struct mi_globals *mig)
2599 {
2600         list_destroy(&mig->mig_list);    /* makes sure the list is empty */
2601         mutex_destroy(&mig->mig_lock);
2602         kmem_free(mig, sizeof (*mig));
2603 
2604 }
2605 
2606 /* ARGSUSED */
2607 static void
2608 nfs_mi_destroy(zoneid_t zoneid, void *data)
2609 {
2610         struct mi_globals *mig = data;
2611 
2612         ASSERT(mig != NULL);
2613         mutex_enter(&mig->mig_lock);
2614         if (list_head(&mig->mig_list) != NULL) {
2615                 /* Still waiting for VFS_FREEVFS() */
2616                 mig->mig_destructor_called = B_TRUE;
2617                 mutex_exit(&mig->mig_lock);
2618                 return;
2619         }
2620         nfs_mi_free_globals(mig);
2621 }
2622 
2623 /*
2624  * Add an NFS mount to the per-zone list of NFS mounts.
2625  */
2626 void
2627 nfs_mi_zonelist_add(mntinfo_t *mi)
2628 {
2629         struct mi_globals *mig;
2630 
2631         mig = zone_getspecific(mi_list_key, mi->mi_zone);
2632         mutex_enter(&mig->mig_lock);
2633         list_insert_head(&mig->mig_list, mi);
2634         mutex_exit(&mig->mig_lock);
2635 }
2636 
2637 /*
2638  * Remove an NFS mount from the per-zone list of NFS mounts.
2639  */
2640 static void
2641 nfs_mi_zonelist_remove(mntinfo_t *mi)
2642 {
2643         struct mi_globals *mig;
2644 
2645         mig = zone_getspecific(mi_list_key, mi->mi_zone);
2646         mutex_enter(&mig->mig_lock);
2647         list_remove(&mig->mig_list, mi);
2648         /*
2649          * We can be called asynchronously by VFS_FREEVFS() after the zone
2650          * shutdown/destroy callbacks have executed; if so, clean up the zone's
2651          * mi globals.
2652          */
2653         if (list_head(&mig->mig_list) == NULL &&
2654             mig->mig_destructor_called == B_TRUE) {
2655                 nfs_mi_free_globals(mig);
2656                 return;
2657         }
2658         mutex_exit(&mig->mig_lock);
2659 }
2660 
2661 /*
2662  * NFS Client initialization routine.  This routine should only be called
2663  * once.  It performs the following tasks:
2664  *      - Initialize all global locks
2665  *      - Call sub-initialization routines (localize access to variables)
2666  */
2667 int
2668 nfs_clntinit(void)
2669 {
2670 #ifdef DEBUG
2671         static boolean_t nfs_clntup = B_FALSE;
2672 #endif
2673         int error;
2674 
2675 #ifdef DEBUG
2676         ASSERT(nfs_clntup == B_FALSE);
2677 #endif
2678 
2679         error = nfs_subrinit();
2680         if (error)
2681                 return (error);
2682 
2683         error = nfs_vfsinit();
2684         if (error) {
2685                 /*
2686                  * Clean up nfs_subrinit() work
2687                  */
2688                 nfs_subrfini();
2689                 return (error);
2690         }
2691         zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2692             nfs_mi_destroy);
2693 
2694         x_READ3args = xdr_READ3args;
2695         x_READ3res = xdr_READ3res;
2696         x_READ3vres = xdr_READ3vres;
2697         x_READ3uiores = xdr_READ3uiores;
2698 
2699         nfs4_clnt_init();
2700 
2701 #ifdef DEBUG
2702         nfs_clntup = B_TRUE;
2703 #endif
2704 
2705         return (0);
2706 }
2707 
2708 /*
2709  * This routine is only called if the NFS Client has been initialized but
2710  * the module failed to be installed. This routine will clean up the previously
2711  * allocated/initialized work.
2712  */
2713 void
2714 nfs_clntfini(void)
2715 {
2716         (void) zone_key_delete(mi_list_key);
2717         nfs_subrfini();
2718         nfs_vfsfini();
2719         nfs4_clnt_fini();
2720 }
2721 
2722 /*
2723  * nfs_lockrelease:
2724  *
2725  * Release any locks on the given vnode that are held by the current
2726  * process.
2727  */
2728 void
2729 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2730 {
2731         flock64_t ld;
2732         struct shrlock shr;
2733         char *buf;
2734         int remote_lock_possible;
2735         int ret;
2736 
2737         ASSERT((uintptr_t)vp > KERNELBASE);
2738 
2739         /*
2740          * Generate an explicit unlock operation for the entire file.  As a
2741          * partial optimization, only generate the unlock if there is a
2742          * lock registered for the file.  We could check whether this
2743          * particular process has any locks on the file, but that would
2744          * require the local locking code to provide yet another query
2745          * routine.  Note that no explicit synchronization is needed here.
2746          * At worst, flk_has_remote_locks() will return a false positive,
2747          * in which case the unlock call wastes time but doesn't harm
2748          * correctness.
2749          *
2750          * In addition, an unlock request is generated if the process
2751          * is listed as possibly having a lock on the file because the
2752          * server and client lock managers may have gotten out of sync.
2753          * N.B. It is important to make sure nfs_remove_locking_id() is
2754          * called here even if flk_has_remote_locks(vp) reports true.
2755          * If it is not called and there is an entry on the process id
2756          * list, that entry will never get removed.
2757          */
2758         remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2759             (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2760         if (remote_lock_possible || flk_has_remote_locks(vp)) {
2761                 ld.l_type = F_UNLCK;    /* set to unlock entire file */
2762                 ld.l_whence = 0;        /* unlock from start of file */
2763                 ld.l_start = 0;
2764                 ld.l_len = 0;           /* do entire file */
2765                 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr);
2766 
2767                 if (ret != 0) {
2768                         /*
2769                          * If VOP_FRLOCK fails, make sure we unregister
2770                          * local locks before we continue.
2771                          */
2772                         ld.l_pid = ttoproc(curthread)->p_pid;
2773                         lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2774 #ifdef DEBUG
2775                         nfs_perror(ret,
2776                             "NFS lock release error on vp %p: %m.\n",
2777                             (void *)vp, NULL);
2778 #endif
2779                 }
2780 
2781                 /*
2782                  * The call to VOP_FRLOCK may put the pid back on the
2783                  * list.  We need to remove it.
2784                  */
2785                 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2786                     (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2787         }
2788 
2789         /*
2790          * As long as the vp has a share matching our pid,
2791          * pluck it off and unshare it.  There are circumstances in
2792          * which the call to nfs_remove_locking_id() may put the
2793          * owner back on the list, in which case we simply do a
2794          * redundant and harmless unshare.
2795          */
2796         buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2797         while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2798             (char *)NULL, buf, &shr.s_own_len)) {
2799                 shr.s_owner = buf;
2800                 shr.s_access = 0;
2801                 shr.s_deny = 0;
2802                 shr.s_sysid = 0;
2803                 shr.s_pid = curproc->p_pid;
2804 
2805                 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr);
2806 #ifdef DEBUG
2807                 if (ret != 0) {
2808                         nfs_perror(ret,
2809                             "NFS share release error on vp %p: %m.\n",
2810                             (void *)vp, NULL);
2811                 }
2812 #endif
2813         }
2814         kmem_free(buf, MAX_SHR_OWNER_LEN);
2815 }
2816 
2817 /*
2818  * nfs_lockcompletion:
2819  *
2820  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2821  * as non-cachable (set the VNOCACHE bit).
2822  */
2823 
2824 void
2825 nfs_lockcompletion(vnode_t *vp, int cmd)
2826 {
2827 #ifdef DEBUG
2828         rnode_t *rp = VTOR(vp);
2829 
2830         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2831 #endif
2832 
2833         if (cmd == F_SETLK || cmd == F_SETLKW) {
2834                 if (!lm_safemap(vp)) {
2835                         mutex_enter(&vp->v_lock);
2836                         vp->v_flag |= VNOCACHE;
2837                         mutex_exit(&vp->v_lock);
2838                 } else {
2839                         mutex_enter(&vp->v_lock);
2840                         vp->v_flag &= ~VNOCACHE;
2841                         mutex_exit(&vp->v_lock);
2842                 }
2843         }
2844         /*
2845          * The cached attributes of the file are stale after acquiring
2846          * the lock on the file. They were updated when the file was
2847          * opened, but not updated when the lock was acquired. Therefore the
2848          * cached attributes are invalidated after the lock is obtained.
2849          */
2850         PURGE_ATTRCACHE(vp);
2851 }
2852 
2853 /*
2854  * The lock manager holds state making it possible for the client
2855  * and server to be out of sync.  For example, if the response from
2856  * the server granting a lock request is lost, the server will think
2857  * the lock is granted and the client will think the lock is lost.
2858  * The client can tell when it is not sure whether it is in sync
2859  * with the server.
2860  *
2861  * To deal with this, a list of processes for which the client is
2862  * not sure if the server holds a lock is attached to the rnode.
2863  * When such a process closes the rnode, an unlock request is sent
2864  * to the server to unlock the entire file.
2865  *
2866  * The list is kept as a singly linked, NULL-terminated list.
2867  * Because it is only added to under extreme error conditions, the
2868  * list shouldn't get very big.  DEBUG kernels print a message if
2869  * the list gets bigger than nfs_lmpl_high_water.  This is arbitrarily
2870  * choosen to be 8, but can be tuned at runtime.
2871  */
2872 #ifdef DEBUG
2873 /* int nfs_lmpl_high_water = 8; */
2874 int nfs_lmpl_high_water = 128;
2875 int nfs_cnt_add_locking_id = 0;
2876 int nfs_len_add_locking_id = 0;
2877 #endif /* DEBUG */
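/*
 * Each list entry is an lmpl_t hanging off rp->r_lmpl; for example, a
 * process that may hold both a lock and a share reservation would
 * contribute two entries, roughly:
 *
 *	rp->r_lmpl -> { RLMPL_PID,   pid, lmpl_owner = &pid       }
 *	           -> { RLMPL_OWNER, pid, lmpl_owner = <share id> }
 *	           -> NULL
 */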
2878 
2879 /*
2880  * Record that the nfs lock manager server may be holding a lock on
2881  * a vnode for a process.
2882  *
2883  * Because the nfs lock manager server holds state, it is possible
2884  * for the server to get out of sync with the client.  This routine is called
2885  * from the client when it is no longer sure if the server is in sync
2886  * with the client.  nfs_lockrelease() will then notice this and send
2887  * an unlock request when the file is closed.
2888  */
2889 void
2890 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
2891 {
2892         rnode_t *rp;
2893         lmpl_t *new;
2894         lmpl_t *cur;
2895         lmpl_t **lmplp;
2896 #ifdef DEBUG
2897         int list_len = 1;
2898 #endif /* DEBUG */
2899 
2900 #ifdef DEBUG
2901         ++nfs_cnt_add_locking_id;
2902 #endif /* DEBUG */
2903         /*
2904          * allocate a new lmpl_t now so we don't sleep
2905          * later after grabbing mutexes.
2906          */
2907         ASSERT(len < MAX_SHR_OWNER_LEN);
2908         new = kmem_alloc(sizeof (*new), KM_SLEEP);
2909         new->lmpl_type = type;
2910         new->lmpl_pid = pid;
2911         new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
2912         bcopy(id, new->lmpl_owner, len);
2913         new->lmpl_own_len = len;
2914         new->lmpl_next = (lmpl_t *)NULL;
2915 #ifdef DEBUG
2916         if (type == RLMPL_PID) {
2917                 ASSERT(len == sizeof (pid_t));
2918                 ASSERT(pid == *(pid_t *)new->lmpl_owner);
2919         } else {
2920                 ASSERT(type == RLMPL_OWNER);
2921         }
2922 #endif
2923 
2924         rp = VTOR(vp);
2925         mutex_enter(&rp->r_statelock);
2926 
2927         /*
2928          * Add this id to the list for this rnode only if the
2929          * rnode is active and the id is not already there.
2930          */
2931         ASSERT(rp->r_flags & RHASHED);
2932         lmplp = &(rp->r_lmpl);
2933         for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
2934                 if (cur->lmpl_pid == pid &&
2935                     cur->lmpl_type == type &&
2936                     cur->lmpl_own_len == len &&
2937                     bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
2938                         kmem_free(new->lmpl_owner, len);
2939                         kmem_free(new, sizeof (*new));
2940                         break;
2941                 }
2942                 lmplp = &cur->lmpl_next;
2943 #ifdef DEBUG
2944                 ++list_len;
2945 #endif /* DEBUG */
2946         }
2947         if (cur == (lmpl_t *)NULL) {
2948                 *lmplp = new;
2949 #ifdef DEBUG
2950                 if (list_len > nfs_len_add_locking_id) {
2951                         nfs_len_add_locking_id = list_len;
2952                 }
2953                 if (list_len > nfs_lmpl_high_water) {
2954                         cmn_err(CE_WARN, "nfs_add_locking_id: long list "
2955                             "vp=%p is %d", (void *)vp, list_len);
2956                 }
2957 #endif /* DEBUG */
2958         }
2959 
2960 #ifdef DEBUG
2961         if (share_debug) {
2962                 int nitems = 0;
2963                 int npids = 0;
2964                 int nowners = 0;
2965 
2966                 /*
2967                  * Count the number of entries on r_lmpl after the add.
2968                  */
2969                 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
2970                     cur = cur->lmpl_next) {
2971                         nitems++;
2972                         if (cur->lmpl_type == RLMPL_PID) {
2973                                 npids++;
2974                         } else if (cur->lmpl_type == RLMPL_OWNER) {
2975                                 nowners++;
2976                         } else {
2977                                 cmn_err(CE_PANIC, "nfs_add_locking_id: "
2978                                     "unrecognised lmpl_type %d",
2979                                     cur->lmpl_type);
2980                         }
2981                 }
2982 
2983                 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
2984                     "OWNs = %d items left on r_lmpl\n",
2985                     (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
2986         }
2987 #endif
2988 
2989         mutex_exit(&rp->r_statelock);
2990 }
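
/*
 * Editor's sketch (not in the original source): the calling pattern
 * nfs_add_locking_id() supports.  The NFS_LMPL_EXAMPLE guard and the
 * function below are hypothetical, so the sketch is never compiled.
 */
#ifdef NFS_LMPL_EXAMPLE
static void
nfs_lmpl_add_example(vnode_t *vp)
{
        pid_t pid = curproc->p_pid;

        /*
         * The reply to a lock request was lost, so it is no longer
         * certain whether the server holds a lock for this process.
         * Record that; nfs_lockrelease() will send a whole-file
         * unlock when the file is closed.
         */
        nfs_add_locking_id(vp, pid, RLMPL_PID, (char *)&pid,
            sizeof (pid_t));
}
#endif  /* NFS_LMPL_EXAMPLE */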
2991 
2992 /*
2993  * Remove an id from the lock manager id list.
2994  *
2995  * If the id is not in the list return 0.  If it was found and
2996  * removed, return 1.
2997  */
2998 static int
2999 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3000 {
3001         lmpl_t *cur;
3002         lmpl_t **lmplp;
3003         rnode_t *rp;
3004         int rv = 0;
3005 
3006         ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3007 
3008         rp = VTOR(vp);
3009 
3010         mutex_enter(&rp->r_statelock);
3011         ASSERT(rp->r_flags & RHASHED);
3012         lmplp = &(rp->r_lmpl);
3013 
3014         /*
3015          * Search through the list and remove the entry for this id
3016          * if it is there.  The special case id == NULL allows removal
3017          * of the first share on the r_lmpl list belonging to the
3018          * current process (if any), without regard to further details
3019          * of its identity.
3020          */
3021         for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3022                 if (cur->lmpl_type == type &&
3023                     cur->lmpl_pid == curproc->p_pid &&
3024                     (id == (char *)NULL ||
3025                     bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3026                         *lmplp = cur->lmpl_next;
3027                         ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3028                         if (rid != NULL) {
3029                                 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3030                                 *rlen = cur->lmpl_own_len;
3031                         }
3032                         kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3033                         kmem_free(cur, sizeof (*cur));
3034                         rv = 1;
3035                         break;
3036                 }
3037                 lmplp = &cur->lmpl_next;
3038         }
3039 
3040 #ifdef DEBUG
3041         if (share_debug) {
3042                 int nitems = 0;
3043                 int npids = 0;
3044                 int nowners = 0;
3045 
3046                 /*
3047                  * Count the number of things left on r_lmpl after the remove.
3048                  */
3049                 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3050                     cur = cur->lmpl_next) {
3051                         nitems++;
3052                         if (cur->lmpl_type == RLMPL_PID) {
3053                                 npids++;
3054                         } else if (cur->lmpl_type == RLMPL_OWNER) {
3055                                 nowners++;
3056                         } else {
3057                                 cmn_err(CE_PANIC, "nfs_remove_locking_id: "
3058                                     "unrecognised lmpl_type %d",
3059                                     cur->lmpl_type);
3060                         }
3061                 }
3062 
3063                 cmn_err(CE_CONT, "nfs_remove_locking_id(%s): %d PIDs "
3064                     "+ %d OWNs = %d items left on r_lmpl\n",
3065                     (type == RLMPL_PID) ? "P" : "O",
3066                     npids,
3067                     nowners,
3068                     nitems);
3069         }
3070 #endif
3071 
3072         mutex_exit(&rp->r_statelock);
3073         return (rv);
3074 }
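
/*
 * Editor's sketch (not in the original source): the matching removal,
 * under the same hypothetical NFS_LMPL_EXAMPLE guard.  rid == NULL
 * tells nfs_remove_locking_id() that the caller does not need the
 * removed owner string copied back, so rlen may be NULL as well.
 */
#ifdef NFS_LMPL_EXAMPLE
static void
nfs_lmpl_remove_example(vnode_t *vp)
{
        pid_t pid = curproc->p_pid;

        /*
         * Drop the record for the current process, if any; the return
         * value says whether an entry was actually removed.
         */
        (void) nfs_remove_locking_id(vp, RLMPL_PID, (char *)&pid,
            NULL, NULL);
}
#endif  /* NFS_LMPL_EXAMPLE */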
3075 
3076 void
3077 nfs_free_mi(mntinfo_t *mi)
3078 {
3079         ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3080         ASSERT(mi->mi_manager_thread == NULL);
3081         ASSERT(mi->mi_threads == 0);
3082 
3083         /*
3084          * Remove the node from the global list before we start tearing it down.
3085          */
3086         nfs_mi_zonelist_remove(mi);
3087         if (mi->mi_klmconfig) {
3088                 lm_free_config(mi->mi_klmconfig);
3089                 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3090         }
3091         mutex_destroy(&mi->mi_lock);
3092         mutex_destroy(&mi->mi_remap_lock);
3093         mutex_destroy(&mi->mi_async_lock);
3094         cv_destroy(&mi->mi_failover_cv);
3095         cv_destroy(&mi->mi_async_work_cv);
3096         cv_destroy(&mi->mi_async_reqs_cv);
3097         cv_destroy(&mi->mi_async_cv);
3098         zone_rele(mi->mi_zone);
3099         kmem_free(mi, sizeof (*mi));
3100 }
3101 
3102 static int
3103 mnt_kstat_update(kstat_t *ksp, int rw)
3104 {
3105         mntinfo_t *mi;
3106         struct mntinfo_kstat *mik;
3107         vfs_t *vfsp;
3108         int i;
3109 
3110         /* This is a read-only kstat; bail out on a write. */
3111         if (rw == KSTAT_WRITE)
3112                 return (EACCES);
3113 
3114         /*
3115          * We don't want to wait here as kstat_chain_lock could be held by
3116          * dounmount(). dounmount() takes vfs_reflock before the chain lock
3117          * and thus could lead to a deadlock.
3118          */
3119         vfsp = (struct vfs *)ksp->ks_private;
3120 
3122         mi = VFTOMI(vfsp);
3123 
3124         mik = (struct mntinfo_kstat *)ksp->ks_data;
3125 
3126         (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3127         mik->mik_vers = (uint32_t)mi->mi_vers;
3128         mik->mik_flags = mi->mi_flags;
3129         mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3130         mik->mik_curread = (uint32_t)mi->mi_curread;
3131         mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3132         mik->mik_retrans = mi->mi_retrans;
3133         mik->mik_timeo = mi->mi_timeo;
3134         mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3135         mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3136         mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3137         mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3138         for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3139                 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3140                 mik->mik_timers[i].deviate =
3141                     (uint32_t)mi->mi_timers[i].rt_deviate;
3142                 mik->mik_timers[i].rtxcur =
3143                     (uint32_t)mi->mi_timers[i].rt_rtxcur;
3144         }
3145         mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3146         mik->mik_failover = (uint32_t)mi->mi_failover;
3147         mik->mik_remap = (uint32_t)mi->mi_remap;
3148         (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3149 
3150         return (0);
3151 }
3152 
3153 void
3154 nfs_mnt_kstat_init(struct vfs *vfsp)
3155 {
3156         mntinfo_t *mi = VFTOMI(vfsp);
3157 
3158         /*
3159          * Create the version specific kstats.
3160          *
3161          * PSARC 2001/697 Contract Private Interface
3162          * All nfs kstats are under SunMC contract
3163          * Please refer to the PSARC listed above and contact
3164          * SunMC before making any changes!
3165          *
3166          * Changes must be reviewed by Solaris File Sharing
3167          * Changes must be communicated to contract-2001-697@sun.com
3168          *
3169          */
3170 
3171         mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3172             NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3173         if (mi->mi_io_kstats) {
3174                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3175                         kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3176                 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3177                 kstat_install(mi->mi_io_kstats);
3178         }
3179 
3180         if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3181             getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3182             sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3183                 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3184                         kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3185                 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3186                 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3187                 kstat_install(mi->mi_ro_kstats);
3188         }
3189 }
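
/*
 * Editor's sketch (not in the original source): a userland consumer of
 * the "mntinfo" raw kstat created above, using libkstat (compile with
 * -lkstat).  It is fenced with #if 0 because it does not belong in the
 * kernel; instance 0 is an assumption, and real code would walk
 * kc->kc_chain to find the mount of interest.
 */
#if 0
#include <stdio.h>
#include <kstat.h>
#include <nfs/nfs_clnt.h>

int
main(void)
{
        kstat_ctl_t *kc;
        kstat_t *ksp;
        struct mntinfo_kstat mik;

        if ((kc = kstat_open()) == NULL)
                return (1);
        if ((ksp = kstat_lookup(kc, "nfs", 0, "mntinfo")) != NULL &&
            kstat_read(kc, ksp, &mik) != -1) {
                (void) printf("server=%s proto=%s vers=%u\n",
                    mik.mik_curserver, mik.mik_proto, mik.mik_vers);
        }
        (void) kstat_close(kc);
        return (0);
}
#endif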
3190 
3191 nfs_delmapcall_t *
3192 nfs_init_delmapcall()
3193 {
3194         nfs_delmapcall_t        *delmap_call;
3195 
3196         delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3197         delmap_call->call_id = curthread;
3198         delmap_call->error = 0;
3199 
3200         return (delmap_call);
3201 }
3202 
3203 void
3204 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3205 {
3206         kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3207 }
3208 
3209 /*
3210  * Searches for the current delmap caller (based on curthread) in the list of
3211  * callers.  If it is found, we remove it and free the delmap caller.
3212  * Returns:
3213  *      0 if the caller wasn't found
3214  *      1 if the caller was found, removed, and freed.  *errp is set
3215  *      to the error recorded by that delmap call.
3216  */
3217 int
3218 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3219 {
3220         nfs_delmapcall_t        *delmap_call;
3221 
3222         /*
3223          * If the list doesn't exist yet, we create it and return
3224          * that the caller wasn't found.  No list = no callers.
3225          */
3226         mutex_enter(&rp->r_statelock);
3227         if (!(rp->r_flags & RDELMAPLIST)) {
3228                 /* The list does not exist */
3229                 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3230                     offsetof(nfs_delmapcall_t, call_node));
3231                 rp->r_flags |= RDELMAPLIST;
3232                 mutex_exit(&rp->r_statelock);
3233                 return (0);
3234         } else {
3235                 /* The list exists so search it */
3236                 for (delmap_call = list_head(&rp->r_indelmap);
3237                     delmap_call != NULL;
3238                     delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3239                         if (delmap_call->call_id == curthread) {
3240                                 /* current caller is in the list */
3241                                 *errp = delmap_call->error;
3242                                 list_remove(&rp->r_indelmap, delmap_call);
3243                                 mutex_exit(&rp->r_statelock);
3244                                 nfs_free_delmapcall(delmap_call);
3245                                 return (1);
3246                         }
3247                 }
3248         }
3249         mutex_exit(&rp->r_statelock);
3250         return (0);
3251 }
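
/*
 * Editor's sketch (not in the original source): how the delmap helpers
 * above fit together.  The NFS_DELMAP_EXAMPLE guard and the function
 * below are hypothetical; the real callers are the NFS delmap vnode
 * operations.
 */
#ifdef NFS_DELMAP_EXAMPLE
static int
nfs_delmap_example(rnode_t *rp)
{
        nfs_delmapcall_t *delmap_call;
        int error;

        /*
         * A repeated call by the same thread returns the recorded
         * result.  This call also creates r_indelmap on first use,
         * which makes the insertion below safe.
         */
        if (nfs_find_and_delete_delmapcall(rp, &error))
                return (error);

        delmap_call = nfs_init_delmapcall();
        mutex_enter(&rp->r_statelock);
        list_insert_tail(&rp->r_indelmap, delmap_call);
        mutex_exit(&rp->r_statelock);

        /* ... do the delmap work and record its outcome ... */
        delmap_call->error = 0;

        /*
         * The entry is left on r_indelmap so that a later call by
         * this thread can collect the result via
         * nfs_find_and_delete_delmapcall().
         */
        return (0);
}
#endif  /* NFS_DELMAP_EXAMPLE */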