Old nfs_client.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 *      All rights reserved.
 */

#pragma ident   "@(#)nfs_client.c       1.194   05/12/08 SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>

#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

static void     nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
                        cred_t *);
static int      nfs_getattr_cache(vnode_t *, struct vattr *);
static int      nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);

struct mi_globals {
        kmutex_t        mig_lock;  /* lock protecting mig_list */
        list_t          mig_list;  /* list of NFS v2 or v3 mounts in zone */
        boolean_t       mig_destructor_called;
};

static zone_key_t mi_list_key;

/* Debugging flag for PC file shares. */
extern int      share_debug;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_attrtime)
 * which tells whether the attributes are valid. The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached. This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time. There are minimum and maximum
 * timeout values that can be set per mount point.
 */
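
/*
 * Illustrative sketch (kept under #if 0 and out of the build): the
 * adaptive timeout described above reduces to clamping the time since
 * the last detected change between per-mount minimum and maximum
 * values, as nfs_attrcache_va() does below.  All sketch_ names and the
 * nanosecond typedef are hypothetical, not kernel interfaces.
 */
#if 0
typedef long long sketch_hrtime_t;	/* nanoseconds, like hrtime_t */

static sketch_hrtime_t
sketch_attr_timeout(sketch_hrtime_t now, sketch_hrtime_t last_change,
    sketch_hrtime_t acmin, sketch_hrtime_t acmax)
{
	/* Cache for as long as the file has been stable ... */
	sketch_hrtime_t delta = now - last_change;

	/* ... but never for less than acmin or more than acmax. */
	if (delta < acmin)
		delta = acmin;
	else if (delta > acmax)
		delta = acmax;
	return (now + delta);		/* the new r_attrtime */
}
#endif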

int
nfs_waitfor_purge_complete(vnode_t *vp)
{
        rnode_t *rp;
        k_sigset_t smask;

        rp = VTOR(vp);
        if (rp->r_serial != NULL && rp->r_serial != curthread) {
                mutex_enter(&rp->r_statelock);
                sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
                while (rp->r_serial != NULL) {
                        if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                sigunintr(&smask);
                                mutex_exit(&rp->r_statelock);
                                return (EINTR);
                        }
                }
                sigunintr(&smask);
                mutex_exit(&rp->r_statelock);
        }
        return (0);
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs_validate_caches(vnode_t *vp, cred_t *cr)
{
        int error;
        struct vattr va;

        if (ATTRCACHE_VALID(vp)) {
                error = nfs_waitfor_purge_complete(vp);
                if (error)
                        return (error);
                return (0);
        }

        va.va_mask = AT_ALL;
        return (nfs_getattr_otw(vp, &va, cr));
}

/*
 * Validate caches by checking cached attributes. If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs3_validate_caches(vnode_t *vp, cred_t *cr)
{
        int error;
        struct vattr va;

        if (ATTRCACHE_VALID(vp)) {
                error = nfs_waitfor_purge_complete(vp);
                if (error)
                        return (error);
                return (0);
        }

        va.va_mask = AT_ALL;
        return (nfs3_getattr_otw(vp, &va, cr));
}

/*
 * Purge all of the various NFS `data' caches.
 */
void
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
{
        rnode_t *rp;
        char *contents;
        int size;
        int error;

        /*
         * Purge the DNLC for any entries which refer to this file.
         * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
         */
        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        if (vp->v_count > 1 &&
            (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
            !(rp->r_flags & RINDNLCPURGE)) {
                /*
                 * Set the RINDNLCPURGE flag to prevent recursive entry
                 * into dnlc_purge_vp()
                 */
                if (vp->v_type == VDIR)
                        rp->r_flags |= RINDNLCPURGE;
                mutex_exit(&rp->r_statelock);
                dnlc_purge_vp(vp);
                mutex_enter(&rp->r_statelock);
                if (rp->r_flags & RINDNLCPURGE)
                        rp->r_flags &= ~RINDNLCPURGE;
        }

        /*
         * Clear any readdir state bits and purge the readlink response cache.
         */
        contents = rp->r_symlink.contents;
        size = rp->r_symlink.size;
        rp->r_symlink.contents = NULL;
        mutex_exit(&rp->r_statelock);

        if (contents != NULL) {
                kmem_free((void *)contents, size);
        }

        /*
         * Flush the page cache.
         */
        if (vn_has_cached_data(vp)) {
                error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
                if (error && (error == ENOSPC || error == EDQUOT)) {
                        mutex_enter(&rp->r_statelock);
                        if (!rp->r_error)
                                rp->r_error = error;
                        mutex_exit(&rp->r_statelock);
                }
        }

        /*
         * Flush the readdir response cache.
         */
        if (HAVE_RDDIR_CACHE(rp))
                nfs_purge_rddir_cache(vp);
}
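
/*
 * Illustrative sketch (kept under #if 0 and out of the build): the
 * RINDNLCPURGE handling above is a recursion guard -- a flag set under
 * the lock before calling into code that may re-enter this routine,
 * and tested on entry so the nested call skips the purge.  All
 * sketch_ names are hypothetical.
 */
#if 0
#define	SKETCH_INPURGE	0x01

static void
sketch_purge_once(int *flags, kmutex_t *lock, void (*purge)(void))
{
	mutex_enter(lock);
	if (*flags & SKETCH_INPURGE) {	/* already purging: bail */
		mutex_exit(lock);
		return;
	}
	*flags |= SKETCH_INPURGE;	/* block recursive entry */
	mutex_exit(lock);

	purge();			/* may re-enter this function */

	mutex_enter(lock);
	*flags &= ~SKETCH_INPURGE;
	mutex_exit(lock);
}
#endif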

/*
 * Purge the readdir cache of all entries
 */
void
nfs_purge_rddir_cache(vnode_t *vp)
{
        rnode_t *rp;
        rddir_cache *rdc;
        rddir_cache *nrdc;

        rp = VTOR(vp);
top:
        mutex_enter(&rp->r_statelock);
        rp->r_direof = NULL;
        rp->r_flags &= ~RLOOKUP;
        rp->r_flags |= RREADDIRPLUS;
        rdc = avl_first(&rp->r_dir);
        while (rdc != NULL) {
                nrdc = AVL_NEXT(&rp->r_dir, rdc);
                avl_remove(&rp->r_dir, rdc);
                rddir_cache_rele(rdc);
                rdc = nrdc;
        }
        mutex_exit(&rp->r_statelock);
}

/*
 * Do a cache check based on the post-operation attributes.
 * Then make them the new cached attributes.  If no attributes
 * were returned, then mark the attributes as timed out.
 */
void
nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
{
        vattr_t attr;

        if (!poap->attributes) {
                PURGE_ATTRCACHE(vp);
                return;
        }
        (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
}

/*
 * Same as above, but using a vattr
 */
void
nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
    cred_t *cr)
{
        if (!poap->attributes) {
                PURGE_ATTRCACHE(vp);
                return;
        }
        nfs_attr_cache(vp, poap->fres.vap, t, cr);
}

/*
 * Do a cache check based on the weak cache consistency attributes.
 * These consist of a small set of pre-operation attributes and the
 * full set of post-operation attributes.
 *
 * If we are given the pre-operation attributes, then use them to
 * check the validity of the various caches.  Then, if we got the
 * post-operation attributes, make them the new cached attributes.
 * If we didn't get the post-operation attributes, then mark the
 * attribute cache as timed out so that the next reference will
 * cause a GETATTR to the server to refresh with the current
 * attributes.
 *
 * Otherwise, if we didn't get the pre-operation attributes, but
 * we did get the post-operation attributes, then use these
 * attributes to check the validity of the various caches.  This
 * will probably cause a flush of the caches because if the
 * operation succeeded, the attributes of the object were changed
 * in some way from the old post-operation attributes.  This
 * should be okay because it is the safe thing to do.  After
 * checking the data caches, then we make these the new cached
 * attributes.
 *
 * Otherwise, we didn't get either the pre- or post-operation
 * attributes.  Simply mark the attribute cache as timed out so
 * the next reference will cause a GETATTR to the server to
 * refresh with the current attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
{
        vattr_t bva;
        vattr_t ava;

        if (wccp->after.attributes) {
                if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
                        PURGE_ATTRCACHE(vp);
                        return;
                }
                if (wccp->before.attributes) {
                        bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
                        bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
                        bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
                        bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
                        bva.va_size = wccp->before.attr.size;
                        nfs3_attr_cache(vp, &bva, &ava, t, cr);
                } else
                        nfs_attr_cache(vp, &ava, t, cr);
        } else {
                PURGE_ATTRCACHE(vp);
        }
}
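
/*
 * Illustrative sketch (kept under #if 0 and out of the build): the
 * decision tree implemented by nfs3_cache_wcc_data() above, reduced
 * to its shape.  have_pre/have_post stand for the attributes bits in
 * the wcc_data; the sketch_ helpers are hypothetical stand-ins for
 * PURGE_ATTRCACHE(), nfs3_attr_cache() and nfs_attr_cache().
 */
#if 0
static void
sketch_wcc_check(int have_pre, int have_post)
{
	if (!have_post) {
		/* no attrs at all: force a fresh GETATTR next reference */
		sketch_purge_attrcache();
		return;
	}
	if (have_pre) {
		/* validate caches against "before", then cache "after" */
		sketch_attr_cache_pre_post();
	} else {
		/* only "after": validate and cache with the same attrs */
		sketch_attr_cache_post_only();
	}
}
#endif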

/*
 * Set attributes cache for given vnode using nfsattr.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
{
        rnode_t *rp;
        struct vattr va;

        if (!nattr_to_vattr(vp, na, &va)) {
                rp = VTOR(vp);
                mutex_enter(&rp->r_statelock);
                if (rp->r_mtime <= t)
                        nfs_attrcache_va(vp, &va);
                mutex_exit(&rp->r_statelock);
        } else {
                PURGE_ATTRCACHE(vp);
        }
}

/*
 * Set attributes cache for given vnode using fattr3.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
{
        rnode_t *rp;
        struct vattr va;

        if (!fattr3_to_vattr(vp, na, &va)) {
                rp = VTOR(vp);
                mutex_enter(&rp->r_statelock);
                if (rp->r_mtime <= t)
                        nfs_attrcache_va(vp, &va);
                mutex_exit(&rp->r_statelock);
        } else {
                PURGE_ATTRCACHE(vp);
        }
}

/*
 * Do a cache check based on attributes returned over the wire.  The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
    cred_t *cr)
{
        int error;

        error = nattr_to_vattr(vp, na, vap);
        if (error)
                return (error);
        nfs_attr_cache(vp, vap, t, cr);
        return (0);
}

/*
 * Do a cache check based on attributes returned over the wire.  The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
{
        int error;

        error = fattr3_to_vattr(vp, na, vap);
        if (error)
                return (error);
        nfs_attr_cache(vp, vap, t, cr);
        return (0);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */
void
nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
{
        rnode_t *rp;
        int mtime_changed;
        int ctime_changed;
        vsecattr_t *vsp;
        int was_serial;

        rp = VTOR(vp);

        mutex_enter(&rp->r_statelock);

        if (rp->r_serial != curthread) {
                klwp_t *lwp = ttolwp(curthread);

                was_serial = 0;
                if (lwp != NULL)
                        lwp->lwp_nostop++;
                while (rp->r_serial != NULL) {
                        if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                mutex_exit(&rp->r_statelock);
                                if (lwp != NULL)
                                        lwp->lwp_nostop--;
                                return;
                        }
                }
                if (lwp != NULL)
                        lwp->lwp_nostop--;
        } else
                was_serial = 1;

        if (rp->r_mtime > t) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        if (!(rp->r_flags & RWRITEATTR)) {
                if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
                        mtime_changed = 1;
                else
                        mtime_changed = 0;
                if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
                    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
                        ctime_changed = 1;
                else
                        ctime_changed = 0;
        } else if (rp->r_size != vap->va_size &&
                    (!vn_has_cached_data(vp) ||
                    (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) {
                mtime_changed = 1;
                ctime_changed = 0;
        } else {
                mtime_changed = 0;
                ctime_changed = 0;
        }

        nfs_attrcache_va(vp, vap);

        if (!mtime_changed && !ctime_changed) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        rp->r_serial = curthread;

        mutex_exit(&rp->r_statelock);

        if (mtime_changed)
                nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

        if (ctime_changed) {
                (void) nfs_access_purge_rp(rp);
                if (rp->r_secattr != NULL) {
                        mutex_enter(&rp->r_statelock);
                        vsp = rp->r_secattr;
                        rp->r_secattr = NULL;
                        mutex_exit(&rp->r_statelock);
                        if (vsp != NULL)
                                nfs_acl_free(vsp);
                }
        }

        if (!was_serial) {
                mutex_enter(&rp->r_statelock);
                rp->r_serial = NULL;
                cv_broadcast(&rp->r_cv);
                mutex_exit(&rp->r_statelock);
        }
}
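
/*
 * Illustrative sketch (kept under #if 0 and out of the build): the
 * r_serial "pseudo lock" used above is an owner-recorded serialization
 * built from r_statelock and r_cv.  This reduces it to its shape; the
 * original waits with cv_wait_sig() and only takes ownership when it
 * actually needs to invalidate caches.  sketch_ names are hypothetical.
 */
#if 0
static void
sketch_serial_enter(kthread_t **owner, kmutex_t *lock, kcondvar_t *cv)
{
	mutex_enter(lock);
	while (*owner != NULL && *owner != curthread)
		cv_wait(cv, lock);	/* original uses cv_wait_sig() */
	*owner = curthread;
	mutex_exit(lock);
}

static void
sketch_serial_exit(kthread_t **owner, kmutex_t *lock, kcondvar_t *cv)
{
	mutex_enter(lock);
	*owner = NULL;
	cv_broadcast(cv);		/* wake every waiting thread */
	mutex_exit(lock);
}
#endif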

/*
 * Use the passed in "before" virtual attributes to check to see
 * whether the data and metadata caches are valid, cache the "after"
 * new attributes, and then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with both pre operation attributes and post operation
 * attributes.
 */
static void
nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
    cred_t *cr)
{
        rnode_t *rp;
        int mtime_changed;
        int ctime_changed;
        vsecattr_t *vsp;
        int was_serial;

        rp = VTOR(vp);

        mutex_enter(&rp->r_statelock);

        if (rp->r_serial != curthread) {
                klwp_t *lwp = ttolwp(curthread);

                was_serial = 0;
                if (lwp != NULL)
                        lwp->lwp_nostop++;
                while (rp->r_serial != NULL) {
                        if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
                                mutex_exit(&rp->r_statelock);
                                if (lwp != NULL)
                                        lwp->lwp_nostop--;
                                return;
                        }
                }
                if (lwp != NULL)
                        lwp->lwp_nostop--;
        } else
                was_serial = 1;

        if (rp->r_mtime > t) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        if (!(rp->r_flags & RWRITEATTR)) {
                if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
                        mtime_changed = 1;
                else
                        mtime_changed = 0;
                if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
                    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
                        ctime_changed = 1;
                else
                        ctime_changed = 0;
        } else {
                mtime_changed = 0;
                ctime_changed = 0;
        }

        nfs_attrcache_va(vp, avap);

        if (!mtime_changed && !ctime_changed) {
                mutex_exit(&rp->r_statelock);
                return;
        }

        rp->r_serial = curthread;

        mutex_exit(&rp->r_statelock);

        if (mtime_changed)
                nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

        if (ctime_changed) {
                (void) nfs_access_purge_rp(rp);
                if (rp->r_secattr != NULL) {
                        mutex_enter(&rp->r_statelock);
                        vsp = rp->r_secattr;
                        rp->r_secattr = NULL;
                        mutex_exit(&rp->r_statelock);
                        if (vsp != NULL)
                                nfs_acl_free(vsp);
                }
        }

        if (!was_serial) {
                mutex_enter(&rp->r_statelock);
                rp->r_serial = NULL;
                cv_broadcast(&rp->r_cv);
                mutex_exit(&rp->r_statelock);
        }
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
void
nfs_attrcache_va(vnode_t *vp, struct vattr *va)
{
        rnode_t *rp;
        mntinfo_t *mi;
        hrtime_t delta;
        hrtime_t now;

        rp = VTOR(vp);

        ASSERT(MUTEX_HELD(&rp->r_statelock));

        now = gethrtime();

        mi = VTOMI(vp);

        /*
         * Delta is the number of nanoseconds that we will
         * cache the attributes of the file.  It is based on
         * the number of nanoseconds since the last time that
         * we detected a change.  The assumption is that files
         * that changed recently are likely to change again.
         * There are enforced minimum and maximum values for
         * regular files and for directories.
         *
         * Using the time since last change was detected
         * eliminates direct comparison or calculation
         * using mixed client and server times.  NFS does
         * not make any assumptions regarding the client
         * and server clocks being synchronized.
         */
        if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
            va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
            va->va_size != rp->r_attr.va_size)
                rp->r_mtime = now;

        if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
                delta = 0;
        else {
                delta = now - rp->r_mtime;
                if (vp->v_type == VDIR) {
                        if (delta < mi->mi_acdirmin)
                                delta = mi->mi_acdirmin;
                        else if (delta > mi->mi_acdirmax)
                                delta = mi->mi_acdirmax;
                } else {
                        if (delta < mi->mi_acregmin)
                                delta = mi->mi_acregmin;
                        else if (delta > mi->mi_acregmax)
                                delta = mi->mi_acregmax;
                }
        }
        rp->r_attrtime = now + delta;
        rp->r_attr = *va;
        /*
         * Update the size of the file if there is no cached data or if
         * the cached data is clean and there is no data being written
         * out.
         */
        if (rp->r_size != va->va_size &&
            (!vn_has_cached_data(vp) ||
            (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
                rp->r_size = va->va_size;
        nfs_setswaplike(vp, va);
        rp->r_flags &= ~RWRITEATTR;
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
{
        rnode_t *rp;

        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        if (ATTRCACHE_VALID(vp)) {
                /*
                 * Cached attributes are valid
                 */
                *vap = rp->r_attr;
                mutex_exit(&rp->r_statelock);
                return (0);
        }
        mutex_exit(&rp->r_statelock);
        return (1);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        struct nfsattrstat ns;
        int douprintf;
        mntinfo_t *mi;
        failinfo_t fi;
        hrtime_t t;

        mi = VTOMI(vp);
        fi.vp = vp;
        fi.fhp = NULL;          /* no need to update, filehandle not copied */
        fi.copyproc = nfscopyfh;
        fi.lookupproc = nfslookup;
        fi.xattrdirproc = acl_getxattrdir2;

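        /*
         * NOTE: MI_ACL is re-tested after the call below, presumably
         * because acl_getattr2_otw() can clear the flag if the server
         * turns out not to support the ACL side protocol, in which
         * case we fall through to a plain RFS_GETATTR.
         */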
        if (mi->mi_flags & MI_ACL) {
                error = acl_getattr2_otw(vp, vap, cr);
                if (mi->mi_flags & MI_ACL)
                        return (error);
        }

        douprintf = 1;

        t = gethrtime();

        error = rfs2call(mi, RFS_GETATTR,
                        xdr_fhandle, (caddr_t)VTOFH(vp),
                        xdr_attrstat, (caddr_t)&ns, cr,
                        &douprintf, &ns.ns_status, 0, &fi);

        if (!error) {
                error = geterrno(ns.ns_status);
                if (!error)
                        error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
                else {
                        PURGE_STALE_FH(error, vp, cr);
                }
        }

        return (error);
}

/*
 * Return either cached or remote attributes.  If we go to the server
 * for attributes, use them to check and invalidate the caches, then
 * cache the new attributes.
 */
int
nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        rnode_t *rp;

        /*
         * If we've got cached attributes, we're done, otherwise go
         * to the server to get attributes, which will update the cache
         * in the process.
         */
        error = nfs_getattr_cache(vp, vap);
        if (error)
                error = nfs_getattr_otw(vp, vap, cr);

        /* Return the client's view of file size */
        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        vap->va_size = rp->r_size;
        mutex_exit(&rp->r_statelock);

        return (error);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        GETATTR3args args;
        GETATTR3vres res;
        int douprintf;
        failinfo_t fi;
        hrtime_t t;

        args.object = *VTOFH3(vp);
        fi.vp = vp;
        fi.fhp = (caddr_t)&args.object;
        fi.copyproc = nfs3copyfh;
        fi.lookupproc = nfs3lookup;
        fi.xattrdirproc = acl_getxattrdir3;
        res.fres.vp = vp;
        res.fres.vap = vap;

        douprintf = 1;

        t = gethrtime();

        error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
            xdr_nfs_fh3, (caddr_t)&args,
            xdr_GETATTR3vres, (caddr_t)&res, cr,
            &douprintf, &res.status, 0, &fi);

        if (error)
                return (error);

        error = geterrno3(res.status);
        if (error) {
                PURGE_STALE_FH(error, vp, cr);
                return (error);
        }

        /*
         * Catch status codes that indicate fattr3 to vattr translation failure
         */
        if (res.fres.status)
                return (res.fres.status);

        nfs_attr_cache(vp, vap, t, cr);
        return (0);
}

/*
 * Return either cached or remote attributes.  If we go to the server
 * for attributes, use them to check and invalidate the caches, then
 * cache the new attributes.
 */
int
nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
        int error;
        rnode_t *rp;

        /*
         * If we've got cached attributes, we're done, otherwise go
         * to the server to get attributes, which will update the cache
         * in the process.
         */
        error = nfs_getattr_cache(vp, vap);
        if (error)
                error = nfs3_getattr_otw(vp, vap, cr);

        /* Return the client's view of file size */
        rp = VTOR(vp);
        mutex_enter(&rp->r_statelock);
        vap->va_size = rp->r_size;
        mutex_exit(&rp->r_statelock);

        return (error);
}

vtype_t nf_to_vt[] = {
        VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
};
/*
 * Convert NFS Version 2 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 * Returns 0 on success, or an error if the conversion failed due to
 * overflow.
 */
int
nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
{
        /* overflow in time attributes? */
#ifndef _LP64
        if (!NFS2_FATTR_TIME_OK(na))
                return (EOVERFLOW);
#endif

        if (na->na_type < NFNON || na->na_type > NFSOC)
                vap->va_type = VBAD;
        else
                vap->va_type = nf_to_vt[na->na_type];
        vap->va_mode = na->na_mode;
        vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
        vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
        vap->va_fsid = vp->v_vfsp->vfs_dev;
        vap->va_nodeid = na->na_nodeid;
        vap->va_nlink = na->na_nlink;
        vap->va_size = na->na_size;       /* keep for cache validation */
        /*
         * nfs protocol defines times as unsigned so don't extend sign,
         * unless sysadmin set nfs_allow_preepoch_time.
         */
        NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
        vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
        NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
        vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
        NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
        vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
        /*
         * Shannon's law - uncompress the received dev_t
         * if the top half of it is zero, indicating a response
         * from an `older style' OS.  The exception is a
         * `new style' OS sending a major device of zero,
         * in which case the algorithm still works because the
         * fact that it is a new style server
         * is hidden by the minor device not being greater
         * than 255 (a requirement in this case).
         */
        if ((na->na_rdev & 0xffff0000) == 0)
                vap->va_rdev = nfsv2_expdev(na->na_rdev);
        else
                vap->va_rdev = expldev(na->na_rdev);

        vap->va_nblocks = na->na_blocks;
        switch (na->na_type) {
        case NFBLK:
                vap->va_blksize = DEV_BSIZE;
                break;

        case NFCHR:
                vap->va_blksize = MAXBSIZE;
                break;

        case NFSOC:
        default:
                vap->va_blksize = na->na_blocksize;
                break;
        }
        /*
         * This bit of ugliness is a hack to preserve the
         * over-the-wire protocols for named-pipe vnodes.
         * It remaps the special over-the-wire type to the
         * VFIFO type. (see note in nfs.h)
         */
        if (NA_ISFIFO(na)) {
                vap->va_type = VFIFO;
                vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
                vap->va_rdev = 0;
                vap->va_blksize = na->na_blocksize;
        }
        vap->va_seq = 0;
        return (0);
}
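
/*
 * Illustrative sketch (kept under #if 0 and out of the build): the
 * dev_t expansion above treats a value with a zero top half as the
 * "older style" encoding.  Assuming nfsv2_expdev() splits the low 16
 * bits as major<<8|minor -- an assumption about that helper, not a
 * statement of its actual implementation -- the choice looks like:
 */
#if 0
static dev_t
sketch_expand_rdev(uint32_t otw_rdev)
{
	if ((otw_rdev & 0xffff0000) == 0) {
		/* old style: 8-bit major, 8-bit minor */
		major_t maj = (otw_rdev >> 8) & 0xff;
		minor_t min = otw_rdev & 0xff;
		return (makedevice(maj, min));
	}
	return (expldev(otw_rdev));	/* already-expanded encoding */
}
#endif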

/*
 * Convert NFS Version 3 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 */
vtype_t nf3_to_vt[] = {
        VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
};

int
fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
{
#ifndef _LP64
        /* overflow in time attributes? */
        if (!NFS3_FATTR_TIME_OK(na))
                return (EOVERFLOW);
#endif
        if (!NFS3_SIZE_OK(na->size))
                /* file too big */
                return (EFBIG);

        vap->va_mask = AT_ALL;

        if (na->type < NF3REG || na->type > NF3FIFO)
                vap->va_type = VBAD;
        else
                vap->va_type = nf3_to_vt[na->type];
        vap->va_mode = na->mode;
        vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
        vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
        vap->va_fsid = vp->v_vfsp->vfs_dev;
        vap->va_nodeid = na->fileid;
        vap->va_nlink = na->nlink;
        vap->va_size = na->size;

        /*
         * nfs protocol defines times as unsigned so don't extend sign,
         * unless sysadmin set nfs_allow_preepoch_time.
         */
        NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
        vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
        NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
        vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
        NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
        vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

        switch (na->type) {
        case NF3BLK:
                vap->va_rdev = makedevice(na->rdev.specdata1,
                                        na->rdev.specdata2);
                vap->va_blksize = DEV_BSIZE;
                vap->va_nblocks = 0;
                break;
        case NF3CHR:
                vap->va_rdev = makedevice(na->rdev.specdata1,
                                        na->rdev.specdata2);
                vap->va_blksize = MAXBSIZE;
                vap->va_nblocks = 0;
                break;
        case NF3REG:
        case NF3DIR:
        case NF3LNK:
                vap->va_rdev = 0;
                vap->va_blksize = MAXBSIZE;
                vap->va_nblocks = (u_longlong_t)
                    ((na->used + (size3)DEV_BSIZE - (size3)1) /
                    (size3)DEV_BSIZE);
                break;
        case NF3SOCK:
        case NF3FIFO:
        default:
                vap->va_rdev = 0;
                vap->va_blksize = MAXBSIZE;
                vap->va_nblocks = 0;
                break;
        }
        vap->va_seq = 0;
        return (0);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs_async_putpage and nfs_async_start.
 */

int nfs_async_timeout = -1;     /* uninitialized */

static void     nfs_async_start(struct vfs *);

static void
free_async_args(struct nfs_async_reqs *args)
{
        rnode_t *rp;

        if (args->a_io != NFS_INACTIVE) {
                rp = VTOR(args->a_vp);
                mutex_enter(&rp->r_statelock);
                rp->r_count--;
                if (args->a_io == NFS_PUTAPAGE ||
                    args->a_io == NFS_PAGEIO)
                        rp->r_awcount--;
                cv_broadcast(&rp->r_cv);
                mutex_exit(&rp->r_statelock);
                VN_RELE(args->a_vp);
        }
        crfree(args->a_cred);
        kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
 */
void
nfs_async_manager(vfs_t *vfsp)
{
        callb_cpr_t cprinfo;
        mntinfo_t *mi;
        uint_t max_threads;

        mi = VFTOMI(vfsp);

        CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
                    "nfs_async_manager");

        mutex_enter(&mi->mi_async_lock);
        /*
         * We want to stash the max number of threads that this mount was
         * allowed so we can use it later when the variable is set to zero as
         * part of the zone/mount going away.
         *
         * We want to be able to create at least one thread to handle
         * asynchronous inactive calls.
         */
        max_threads = MAX(mi->mi_max_threads, 1);
        mutex_enter(&mi->mi_lock);
        /*
         * We don't want to wait for mi_max_threads to go to zero, since that
         * happens as part of a failed unmount, but this thread should only
         * exit when the mount/zone is really going away.
         *
         * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
         * attempted: the various _async_*() functions know to do things
         * inline if mi_max_threads == 0.  Henceforth we just drain out the
         * outstanding requests.
         *
         * Note that we still create zthreads even if we notice the zone is
         * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
         * shutdown sequence to take slightly longer in some cases, but
         * doesn't violate the protocol, as all threads will exit as soon as
         * they're done processing the remaining requests.
         */
        while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
            mi->mi_async_req_count > 0) {
                mutex_exit(&mi->mi_lock);
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
                CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
                while (mi->mi_async_req_count > 0) {
                        /*
                         * Paranoia: If the mount started out having
                         * (mi->mi_max_threads == 0), and the value was
                         * later changed (via a debugger or somesuch),
                         * we could be confused since we will think we
                         * can't create any threads, and the calling
                         * code (which looks at the current value of
                         * mi->mi_max_threads, now non-zero) thinks we
                         * can.
                         *
                         * So, because we're paranoid, we create threads
                         * up to the maximum of the original and the
                         * current value. This means that future
                         * (debugger-induced) lowerings of
                         * mi->mi_max_threads are ignored for our
                         * purposes, but who told them they could change
                         * random values on a live kernel anyhow?
                         */
                        if (mi->mi_threads <
                            MAX(mi->mi_max_threads, max_threads)) {
                                mi->mi_threads++;
                                mutex_exit(&mi->mi_async_lock);
                                VFS_HOLD(vfsp); /* hold for new thread */
                                (void) zthread_create(NULL, 0, nfs_async_start,
                                    vfsp, 0, minclsyspri);
                                mutex_enter(&mi->mi_async_lock);
                        }
                        cv_signal(&mi->mi_async_work_cv);
                        ASSERT(mi->mi_async_req_count != 0);
                        mi->mi_async_req_count--;
                }
                mutex_enter(&mi->mi_lock);
        }
        mutex_exit(&mi->mi_lock);
        /*
         * Let everyone know we're done.
         */
        mi->mi_manager_thread = NULL;
        cv_broadcast(&mi->mi_async_cv);

        /*
         * There is no explicit call to mutex_exit(&mi->mi_async_lock)
         * since CALLB_CPR_EXIT is actually responsible for releasing
         * 'mi_async_lock'.
         */
        CALLB_CPR_EXIT(&cprinfo);
        VFS_RELE(vfsp); /* release thread's hold */
        zthread_exit();
}
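
/*
 * Illustrative sketch (kept under #if 0 and out of the build): the
 * core of the manager loop above is a counted-request handoff.
 * Producers bump a request count and signal the manager; the manager
 * spawns workers up to a cap and signals the worker cv once per
 * request.  This omits CPR callbacks, zthread creation and the
 * "paranoia" cap; sketch_ names are hypothetical.
 */
#if 0
struct sketch_async {
	kmutex_t	lock;
	kcondvar_t	reqs_cv;	/* producers -> manager */
	kcondvar_t	work_cv;	/* manager -> workers */
	int		req_count;
	int		threads;
	int		max_threads;
	int		stop;
};

static void
sketch_manager(struct sketch_async *a)
{
	mutex_enter(&a->lock);
	while (!a->stop || a->req_count > 0) {
		cv_wait(&a->reqs_cv, &a->lock);
		while (a->req_count > 0) {
			if (a->threads < a->max_threads) {
				a->threads++;
				/* original zthread_create()s a worker here */
			}
			cv_signal(&a->work_cv);	/* one wakeup per request */
			a->req_count--;
		}
	}
	mutex_exit(&a->lock);
}
#endif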

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs_async_manager_stop(vfs_t *vfsp)
{
        mntinfo_t *mi = VFTOMI(vfsp);

        mutex_enter(&mi->mi_async_lock);
        mutex_enter(&mi->mi_lock);
        mi->mi_flags |= MI_ASYNC_MGR_STOP;
        mutex_exit(&mi->mi_lock);
        cv_broadcast(&mi->mi_async_reqs_cv);
        while (mi->mi_manager_thread != NULL)
                cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
        mutex_exit(&mi->mi_async_lock);
}

int
nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
        struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
        u_offset_t, caddr_t, struct seg *, cred_t *))
{
        rnode_t *rp;
        mntinfo_t *mi;
        struct nfs_async_reqs *args;

        rp = VTOR(vp);
        ASSERT(rp->r_freef == NULL);

        mi = VTOMI(vp);

        /*
         * If addr falls in a different segment, don't bother doing readahead.
         */
        if (addr >= seg->s_base + seg->s_size)
                return (-1);

        /*
         * If we can't allocate a request structure, punt on the readahead.
         */
        if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
                return (-1);

        /*
         * If a lock operation is pending, don't initiate any new
         * readaheads.  Otherwise, bump r_count to indicate the new
         * asynchronous I/O.
         */
        if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
                kmem_free(args, sizeof (*args));
                return (-1);
        }
        mutex_enter(&rp->r_statelock);
        rp->r_count++;
        mutex_exit(&rp->r_statelock);
        nfs_rw_exit(&rp->r_lkserlock);

        args->a_next = NULL;
#ifdef DEBUG
        args->a_queuer = curthread;
#endif
        VN_HOLD(vp);
        args->a_vp = vp;
        ASSERT(cr != NULL);
        crhold(cr);
        args->a_cred = cr;
        args->a_io = NFS_READ_AHEAD;
        args->a_nfs_readahead = readahead;
        args->a_nfs_blkoff = blkoff;
        args->a_nfs_seg = seg;
        args->a_nfs_addr = addr;

        mutex_enter(&mi->mi_async_lock);

        /*
         * If asyncio has been disabled, don't bother readahead.
         */
        if (mi->mi_max_threads == 0) {
                mutex_exit(&mi->mi_async_lock);
                goto noasync;
        }

        /*
         * Link request structure into the async list and
         * wakeup async thread to do the i/o.
         */
        if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
                mi->mi_async_reqs[NFS_READ_AHEAD] = args;
                mi->mi_async_tail[NFS_READ_AHEAD] = args;
        } else {
                mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
                mi->mi_async_tail[NFS_READ_AHEAD] = args;
        }

        if (mi->mi_io_kstats) {
                mutex_enter(&mi->mi_lock);
                kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
                mutex_exit(&mi->mi_lock);
        }

        mi->mi_async_req_count++;
        ASSERT(mi->mi_async_req_count != 0);
        cv_signal(&mi->mi_async_reqs_cv);
        mutex_exit(&mi->mi_async_lock);
        return (0);

noasync:
        mutex_enter(&rp->r_statelock);
        rp->r_count--;
        cv_broadcast(&rp->r_cv);
        mutex_exit(&rp->r_statelock);
        VN_RELE(vp);
        crfree(cr);
        kmem_free(args, sizeof (*args));
        return (-1);
}

int
nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
        int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
        u_offset_t, size_t, int, cred_t *))
{
        rnode_t *rp;
        mntinfo_t *mi;
        struct nfs_async_reqs *args;

        ASSERT(flags & B_ASYNC);
        ASSERT(vp->v_vfsp != NULL);

        rp = VTOR(vp);
        ASSERT(rp->r_count > 0);

        mi = VTOMI(vp);

        /*
         * If we can't allocate a request structure, do the putpage
         * operation synchronously in this thread's context.
         */
        if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
                goto noasync;

        args->a_next = NULL;
#ifdef DEBUG
        args->a_queuer = curthread;
#endif
        VN_HOLD(vp);
        args->a_vp = vp;
        ASSERT(cr != NULL);
        crhold(cr);
        args->a_cred = cr;
        args->a_io = NFS_PUTAPAGE;
        args->a_nfs_putapage = putapage;
        args->a_nfs_pp = pp;
        args->a_nfs_off = off;
        args->a_nfs_len = (uint_t)len;
        args->a_nfs_flags = flags;

        mutex_enter(&mi->mi_async_lock);

        /*
         * If asyncio has been disabled, then make a synchronous request.
         * This check is done a second time in case async io was disabled
         * while this thread was blocked waiting for memory pressure to
         * reduce or for the queue to drain.
         */
        if (mi->mi_max_threads == 0) {
                mutex_exit(&mi->mi_async_lock);
                goto noasync;
        }

        /*
         * Link request structure into the async list and
         * wakeup async thread to do the i/o.
         */
        if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
                mi->mi_async_reqs[NFS_PUTAPAGE] = args;
                mi->mi_async_tail[NFS_PUTAPAGE] = args;
        } else {
                mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
                mi->mi_async_tail[NFS_PUTAPAGE] = args;
        }

        mutex_enter(&rp->r_statelock);
        rp->r_count++;
        rp->r_awcount++;
        mutex_exit(&rp->r_statelock);

        if (mi->mi_io_kstats) {
                mutex_enter(&mi->mi_lock);
                kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
                mutex_exit(&mi->mi_lock);
        }

        mi->mi_async_req_count++;
        ASSERT(mi->mi_async_req_count != 0);
        cv_signal(&mi->mi_async_reqs_cv);
        mutex_exit(&mi->mi_async_lock);
        return (0);

noasync:
        if (args != NULL) {
                VN_RELE(vp);
                crfree(cr);
                kmem_free(args, sizeof (*args));
        }

        if (curproc == proc_pageout || curproc == proc_fsflush) {
                /*
                 * If we get here in the context of the pageout/fsflush,
                 * we refuse to do a sync write, because this may hang
                 * pageout (and the machine). In this case, we just
                 * re-mark the page as dirty and punt on the page.
                 *
                 * Make sure B_FORCE isn't set.  We can re-mark the
                 * pages as dirty and unlock the pages in one swoop by
                 * passing in B_ERROR to pvn_write_done().  However,
                 * we should make sure B_FORCE isn't set - we don't
                 * want the page tossed before it gets written out.
                 */
                if (flags & B_FORCE)
                        flags &= ~(B_INVAL | B_FORCE);
                pvn_write_done(pp, flags | B_ERROR);
                return (0);
        }
        if (nfs_zone() != mi->mi_zone) {
                /*
                 * So this was a cross-zone sync putpage.  We pass in B_ERROR
                 * to pvn_write_done() to re-mark the pages as dirty and unlock
                 * them.
                 *
                 * We don't want to clear B_FORCE here as the caller presumably
                 * knows what they're doing if they set it.
                 */
                pvn_write_done(pp, flags | B_ERROR);
                return (EPERM);
        }
        return ((*putapage)(vp, pp, off, len, flags, cr));
}

int
nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
        int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
        size_t, int, cred_t *))
{
        rnode_t *rp;
        mntinfo_t *mi;
        struct nfs_async_reqs *args;

        ASSERT(flags & B_ASYNC);
        ASSERT(vp->v_vfsp != NULL);

        rp = VTOR(vp);
        ASSERT(rp->r_count > 0);

        mi = VTOMI(vp);

        /*
         * If we can't allocate a request structure, do the pageio
         * request synchronously in this thread's context.
         */
        if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
                goto noasync;

        args->a_next = NULL;
#ifdef DEBUG
        args->a_queuer = curthread;
#endif
        VN_HOLD(vp);
        args->a_vp = vp;
        ASSERT(cr != NULL);
        crhold(cr);
        args->a_cred = cr;
        args->a_io = NFS_PAGEIO;
        args->a_nfs_pageio = pageio;
        args->a_nfs_pp = pp;
        args->a_nfs_off = io_off;
        args->a_nfs_len = (uint_t)io_len;
        args->a_nfs_flags = flags;

        mutex_enter(&mi->mi_async_lock);

        /*
         * If asyncio has been disabled, then make a synchronous request.
         * This check is done a second time in case async io was disabled
         * while this thread was blocked waiting for memory pressure to
         * reduce or for the queue to drain.
         */
1527         if (mi->mi_max_threads == 0) {
1528                 mutex_exit(&mi->mi_async_lock);
1529                 goto noasync;
1530         }
1531 
1532         /*
1533          * Link request structure into the async list and
1534          * wakeup async thread to do the i/o.
1535          */
1536         if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1537                 mi->mi_async_reqs[NFS_PAGEIO] = args;
1538                 mi->mi_async_tail[NFS_PAGEIO] = args;
1539         } else {
1540                 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1541                 mi->mi_async_tail[NFS_PAGEIO] = args;
1542         }
1543 
1544         mutex_enter(&rp->r_statelock);
1545         rp->r_count++;
1546         rp->r_awcount++;
1547         mutex_exit(&rp->r_statelock);
1548 
1549         if (mi->mi_io_kstats) {
1550                 mutex_enter(&mi->mi_lock);
1551                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1552                 mutex_exit(&mi->mi_lock);
1553         }
1554 
1555         mi->mi_async_req_count++;
1556         ASSERT(mi->mi_async_req_count != 0);
1557         cv_signal(&mi->mi_async_reqs_cv);
1558         mutex_exit(&mi->mi_async_lock);
1559         return (0);
1560 
1561 noasync:
1562         if (args != NULL) {
1563                 VN_RELE(vp);
1564                 crfree(cr);
1565                 kmem_free(args, sizeof (*args));
1566         }
1567 
1568         /*
1569          * If we can't do it ASYNC, for reads we do nothing (but clean up
1570          * the page list), for writes we do it synchronously, except for
1571          * proc_pageout/proc_fsflush as described below.
1572          */
1573         if (flags & B_READ) {
1574                 pvn_read_done(pp, flags | B_ERROR);
1575                 return (0);
1576         }
1577 
1578         if (curproc == proc_pageout || curproc == proc_fsflush) {
1579                 /*
1580                  * If we get here in the context of the pageout/fsflush,
1581                  * we refuse to do a sync write, because this may hang
1582                  * pageout/fsflush (and the machine). In this case, we just
1583                  * re-mark the page as dirty and punt on the page.
1584                  *
1585                  * Make sure B_FORCE isn't set.  We can re-mark the
1586                  * pages as dirty and unlock the pages in one swoop by
1587                  * passing in B_ERROR to pvn_write_done().  However,
1588                  * we should make sure B_FORCE isn't set - we don't
1589                  * want the page tossed before it gets written out.
1590                  */
1591                 if (flags & B_FORCE)
1592                         flags &= ~(B_INVAL | B_FORCE);
1593                 pvn_write_done(pp, flags | B_ERROR);
1594                 return (0);
1595         }
1596 
1597         if (nfs_zone() != mi->mi_zone) {
1598                 /*
1599                  * So this was a cross-zone sync pageio.  We pass in B_ERROR
1600                  * to pvn_write_done() to re-mark the pages as dirty and unlock
1601                  * them.
1602                  *
1603                  * We don't want to clear B_FORCE here as the caller presumably
1604                  * knows what they're doing if they set it.
1605                  */
1606                 pvn_write_done(pp, flags | B_ERROR);
1607                 return (EPERM);
1608         }
1609         return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1610 }
1611 
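/*
 * Queue an asynchronous readdir for the given rddir_cache entry.  If
 * the request cannot be queued, the entry is marked RDDIRREQ and any
 * thread waiting in cv_wait_sig() is woken so that it retries the
 * over-the-wire readdir itself (see noasync: below).
 */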
1612 void
1613 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1614         int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1615 {
1616         rnode_t *rp;
1617         mntinfo_t *mi;
1618         struct nfs_async_reqs *args;
1619 
1620         rp = VTOR(vp);
1621         ASSERT(rp->r_freef == NULL);
1622 
1623         mi = VTOMI(vp);
1624 
1625         /*
1626          * If we can't allocate a request structure, do the readdir
1627          * operation synchronously in this thread's context.
1628          */
1629         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1630                 goto noasync;
1631 
1632         args->a_next = NULL;
1633 #ifdef DEBUG
1634         args->a_queuer = curthread;
1635 #endif
1636         VN_HOLD(vp);
1637         args->a_vp = vp;
1638         ASSERT(cr != NULL);
1639         crhold(cr);
1640         args->a_cred = cr;
1641         args->a_io = NFS_READDIR;
1642         args->a_nfs_readdir = readdir;
1643         args->a_nfs_rdc = rdc;
1644 
1645         mutex_enter(&mi->mi_async_lock);
1646 
1647         /*
1648          * If asyncio has been disabled, then make a synchronous request.
1649          */
1650         if (mi->mi_max_threads == 0) {
1651                 mutex_exit(&mi->mi_async_lock);
1652                 goto noasync;
1653         }
1654 
1655         /*
1656          * Link request structure into the async list and
1657          * wakeup async thread to do the i/o.
1658          */
1659         if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1660                 mi->mi_async_reqs[NFS_READDIR] = args;
1661                 mi->mi_async_tail[NFS_READDIR] = args;
1662         } else {
1663                 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1664                 mi->mi_async_tail[NFS_READDIR] = args;
1665         }
1666 
1667         mutex_enter(&rp->r_statelock);
1668         rp->r_count++;
1669         mutex_exit(&rp->r_statelock);
1670 
1671         if (mi->mi_io_kstats) {
1672                 mutex_enter(&mi->mi_lock);
1673                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1674                 mutex_exit(&mi->mi_lock);
1675         }
1676 
1677         mi->mi_async_req_count++;
1678         ASSERT(mi->mi_async_req_count != 0);
1679         cv_signal(&mi->mi_async_reqs_cv);
1680         mutex_exit(&mi->mi_async_lock);
1681         return;
1682 
1683 noasync:
1684         if (args != NULL) {
1685                 VN_RELE(vp);
1686                 crfree(cr);
1687                 kmem_free(args, sizeof (*args));
1688         }
1689 
1690         rdc->entries = NULL;
1691         mutex_enter(&rp->r_statelock);
1692         ASSERT(rdc->flags & RDDIR);
1693         rdc->flags &= ~RDDIR;
1694         rdc->flags |= RDDIRREQ;
1695         /*
1696          * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1697          * is set, wakeup the thread sleeping in cv_wait_sig().
1698          * The woken up thread will reset the flag to RDDIR and will
1699          * continue with the readdir operation.
1700          */
1701         if (rdc->flags & RDDIRWAIT) {
1702                 rdc->flags &= ~RDDIRWAIT;
1703                 cv_broadcast(&rdc->cv);
1704         }
1705         mutex_exit(&rp->r_statelock);
1706         rddir_cache_rele(rdc);
1707 }
1708 
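/*
 * Queue an asynchronous commit of the pages on plist.  If the request
 * cannot be queued, the commit is done synchronously here, except for
 * pageout/fsflush and cross-zone callers, in which case the pages are
 * simply re-marked C_COMMIT and unlocked (see noasync: below).
 */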
1709 void
1710 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1711         cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1712         cred_t *))
1713 {
1714         rnode_t *rp;
1715         mntinfo_t *mi;
1716         struct nfs_async_reqs *args;
1717         page_t *pp;
1718 
1719         rp = VTOR(vp);
1720         mi = VTOMI(vp);
1721 
1722         /*
1723          * If we can't allocate a request structure, do the commit
1724          * operation synchronously in this thread's context.
1725          */
1726         if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1727                 goto noasync;
1728 
1729         args->a_next = NULL;
1730 #ifdef DEBUG
1731         args->a_queuer = curthread;
1732 #endif
1733         VN_HOLD(vp);
1734         args->a_vp = vp;
1735         ASSERT(cr != NULL);
1736         crhold(cr);
1737         args->a_cred = cr;
1738         args->a_io = NFS_COMMIT;
1739         args->a_nfs_commit = commit;
1740         args->a_nfs_plist = plist;
1741         args->a_nfs_offset = offset;
1742         args->a_nfs_count = count;
1743 
1744         mutex_enter(&mi->mi_async_lock);
1745 
1746         /*
1747          * If asyncio has been disabled, then make a synchronous request.
1748          * This check is done a second time in case async i/o was disabled
1749          * while this thread was blocked waiting for memory pressure to
1750          * reduce or for the queue to drain.
1751          */
1752         if (mi->mi_max_threads == 0) {
1753                 mutex_exit(&mi->mi_async_lock);
1754                 goto noasync;
1755         }
1756 
1757         /*
1758          * Link request structure into the async list and
1759          * wakeup async thread to do the i/o.
1760          */
1761         if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1762                 mi->mi_async_reqs[NFS_COMMIT] = args;
1763                 mi->mi_async_tail[NFS_COMMIT] = args;
1764         } else {
1765                 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1766                 mi->mi_async_tail[NFS_COMMIT] = args;
1767         }
1768 
1769         mutex_enter(&rp->r_statelock);
1770         rp->r_count++;
1771         mutex_exit(&rp->r_statelock);
1772 
1773         if (mi->mi_io_kstats) {
1774                 mutex_enter(&mi->mi_lock);
1775                 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1776                 mutex_exit(&mi->mi_lock);
1777         }
1778 
1779         mi->mi_async_req_count++;
1780         ASSERT(mi->mi_async_req_count != 0);
1781         cv_signal(&mi->mi_async_reqs_cv);
1782         mutex_exit(&mi->mi_async_lock);
1783         return;
1784 
1785 noasync:
1786         if (args != NULL) {
1787                 VN_RELE(vp);
1788                 crfree(cr);
1789                 kmem_free(args, sizeof (*args));
1790         }
1791 
1792         if (curproc == proc_pageout || curproc == proc_fsflush ||
1793             nfs_zone() != mi->mi_zone) {
1794                 while (plist != NULL) {
1795                         pp = plist;
1796                         page_sub(&plist, pp);
1797                         pp->p_fsdata = C_COMMIT;
1798                         page_unlock(pp);
1799                 }
1800                 return;
1801         }
1802         (*commit)(vp, plist, offset, count, cr);
1803 }
1804 
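/*
 * Queue the over-the-wire portion of an inactive operation.  Unlike
 * the other async routines above, this allocates with KM_SLEEP and
 * never falls back to a synchronous call in this thread: the vnode
 * must be disposed of even when async i/o has been disabled.
 */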
1805 void
1806 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1807     void (*inactive)(vnode_t *, cred_t *))
1808 {
1809         mntinfo_t *mi;
1810         struct nfs_async_reqs *args;
1811 
1812         mi = VTOMI(vp);
1813 
1814         args = kmem_alloc(sizeof (*args), KM_SLEEP);
1815         args->a_next = NULL;
1816 #ifdef DEBUG
1817         args->a_queuer = curthread;
1818 #endif
1819         args->a_vp = vp;
1820         ASSERT(cr != NULL);
1821         crhold(cr);
1822         args->a_cred = cr;
1823         args->a_io = NFS_INACTIVE;
1824         args->a_nfs_inactive = inactive;
1825 
1826         /*
1827          * Note that we don't check mi->mi_max_threads here, since we
1828          * *need* to get rid of this vnode regardless of whether someone
1829          * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1830          *
1831          * The manager thread knows about this and is willing to create
1832          * at least one thread to accommodate us.
1833          */
1834         mutex_enter(&mi->mi_async_lock);
1835         if (mi->mi_manager_thread == NULL) {
1836                 rnode_t *rp = VTOR(vp);
1837 
1838                 mutex_exit(&mi->mi_async_lock);
1839                 crfree(cr);     /* drop our reference */
1840                 kmem_free(args, sizeof (*args));
1841                 /*
1842                  * We can't do an over-the-wire call since we're in the wrong
1843                  * zone, so we need to clean up state as best we can and then
1844                  * throw away the vnode.
1845                  */
1846                 mutex_enter(&rp->r_statelock);
1847                 if (rp->r_unldvp != NULL) {
1848                         vnode_t *unldvp;
1849                         char *unlname;
1850                         cred_t *unlcred;
1851 
1852                         unldvp = rp->r_unldvp;
1853                         rp->r_unldvp = NULL;
1854                         unlname = rp->r_unlname;
1855                         rp->r_unlname = NULL;
1856                         unlcred = rp->r_unlcred;
1857                         rp->r_unlcred = NULL;
1858                         mutex_exit(&rp->r_statelock);
1859 
1860                         VN_RELE(unldvp);
1861                         kmem_free(unlname, MAXNAMELEN);
1862                         crfree(unlcred);
1863                 } else {
1864                         mutex_exit(&rp->r_statelock);
1865                 }
1866                 /*
1867                  * No need to explicitly throw away any cached pages.  The
1868                  * eventual rinactive() will attempt a synchronous
1869                  * VOP_PUTPAGE() which will immediately fail since the request
1870                  * is coming from the wrong zone, and then will proceed to call
1871                  * nfs_invalidate_pages() which will clean things up for us.
1872                  */
1873                 rp_addfree(VTOR(vp), cr);
1874                 return;
1875         }
1876 
1877         if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1878                 mi->mi_async_reqs[NFS_INACTIVE] = args;
1879         } else {
1880                 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1881         }
1882         mi->mi_async_tail[NFS_INACTIVE] = args;
1883         /*
1884          * Don't increment r_count, since we're trying to get rid of the vnode.
1885          */
1886 
1887         mi->mi_async_req_count++;
1888         ASSERT(mi->mi_async_req_count != 0);
1889         cv_signal(&mi->mi_async_reqs_cv);
1890         mutex_exit(&mi->mi_async_lock);
1891 }
1892 
1893 /*
1894  * The async queues for each mounted file system are arranged as a
1895  * set of queues, one for each async i/o type.  Requests are taken
1896  * from the queues in a round-robin fashion.  A number of consecutive
1897  * requests are taken from each queue before moving on to the next
1898  * queue.  This functionality may allow the NFS Version 2 server to do
1899  * write clustering, even if the client is mixing writes and reads
1900  * because it will take multiple write requests from the queue
1901  * before processing any of the other async i/o types.
1902  *
1903  * XXX The nfs_async_start thread is unsafe in light of the present
1904  * model defined by cpr to suspend the system.  Specifically,
1905  * over-the-wire calls are cpr-unsafe.  The thread should be reevaluated in
1906  * case of future updates to the cpr model.
1907  */
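/*
 * A sketch of the arrangement described above: one singly linked
 * queue per async i/o type, with mi_async_curr pointing at the queue
 * currently being serviced, e.g.:
 *
 *	mi_async_reqs[NFS_PUTAPAGE] -> req -> req -> NULL
 *	mi_async_reqs[NFS_PAGEIO]   -> req -> NULL	<- mi_async_curr
 *	mi_async_reqs[NFS_READDIR]  -> NULL
 *	...
 */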
1908 static void
1909 nfs_async_start(struct vfs *vfsp)
1910 {
1911         struct nfs_async_reqs *args;
1912         mntinfo_t *mi = VFTOMI(vfsp);
1913         clock_t time_left = 1;
1914         callb_cpr_t cprinfo;
1915         int i;
1916 
1917         /*
1918          * Dynamic initialization of nfs_async_timeout to allow nfs to be
1919          * built in an implementation independent manner.
1920          */
1921         if (nfs_async_timeout == -1)
1922                 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1923 
1924         CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1925 
1926         mutex_enter(&mi->mi_async_lock);
1927         for (;;) {
1928                 /*
1929                  * Find the next queue containing an entry.  We start
1930                  * at the current queue pointer and then round robin
1931                  * through all of them until we either find a non-empty
1932                  * queue or have looked through all of them.
1933                  */
1934                 for (i = 0; i < NFS_ASYNC_TYPES; i++) {
1935                         args = *mi->mi_async_curr;
1936                         if (args != NULL)
1937                                 break;
1938                         mi->mi_async_curr++;
1939                         if (mi->mi_async_curr ==
1940                             &mi->mi_async_reqs[NFS_ASYNC_TYPES])
1941                                 mi->mi_async_curr = &mi->mi_async_reqs[0];
1942                 }
1943                 /*
1944          * If we didn't find an entry, then block until woken up
1945                  * again and then look through the queues again.
1946                  */
1947                 if (args == NULL) {
1948                         /*
1949                          * Exiting is considered to be safe for CPR as well
1950                          */
1951                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
1952 
1953                         /*
1954                          * Wakeup thread waiting to unmount the file
1955                          * system only if all async threads are inactive.
1956                          *
1957                          * If we've timed-out and there's nothing to do,
1958                          * then get rid of this thread.
1959                          */
1960                         if (mi->mi_max_threads == 0 || time_left <= 0) {
1961                                 if (--mi->mi_threads == 0)
1962                                         cv_signal(&mi->mi_async_cv);
1963                                 CALLB_CPR_EXIT(&cprinfo);
1964                                 VFS_RELE(vfsp); /* release thread's hold */
1965                                 zthread_exit();
1966                                 /* NOTREACHED */
1967                         }
1968                         time_left = cv_timedwait(&mi->mi_async_work_cv,
1969                             &mi->mi_async_lock, nfs_async_timeout + lbolt);
1970 
1971                         CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1972 
1973                         continue;
1974                 }
1975                 time_left = 1;
1976 
1977                 /*
1978                  * Remove the request from the async queue and then
1979                  * update the current async request queue pointer.  If
1980                  * the current queue is empty or we have removed enough
1981                  * consecutive entries from it, then reset the counter
1982                  * for this queue and then move the current pointer to
1983                  * the next queue.
1984                  */
1985                 *mi->mi_async_curr = args->a_next;
1986                 if (*mi->mi_async_curr == NULL ||
1987                     --mi->mi_async_clusters[args->a_io] == 0) {
1988                         mi->mi_async_clusters[args->a_io] =
1989                                                 mi->mi_async_init_clusters;
1990                         mi->mi_async_curr++;
1991                         if (mi->mi_async_curr ==
1992                             &mi->mi_async_reqs[NFS_ASYNC_TYPES])
1993                                 mi->mi_async_curr = &mi->mi_async_reqs[0];
1994                 }
1995 
1996                 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
1997                         mutex_enter(&mi->mi_lock);
1998                         kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1999                         mutex_exit(&mi->mi_lock);
2000                 }
2001 
2002                 mutex_exit(&mi->mi_async_lock);
2003 
2004                 /*
2005                  * Obtain arguments from the async request structure.
2006                  */
2007                 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2008                         (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2009                                         args->a_nfs_addr, args->a_nfs_seg,
2010                                         args->a_cred);
2011                 } else if (args->a_io == NFS_PUTAPAGE) {
2012                         (void) (*args->a_nfs_putapage)(args->a_vp,
2013                                         args->a_nfs_pp, args->a_nfs_off,
2014                                         args->a_nfs_len, args->a_nfs_flags,
2015                                         args->a_cred);
2016                 } else if (args->a_io == NFS_PAGEIO) {
2017                         (void) (*args->a_nfs_pageio)(args->a_vp,
2018                                         args->a_nfs_pp, args->a_nfs_off,
2019                                         args->a_nfs_len, args->a_nfs_flags,
2020                                         args->a_cred);
2021                 } else if (args->a_io == NFS_READDIR) {
2022                         (void) ((*args->a_nfs_readdir)(args->a_vp,
2023                                         args->a_nfs_rdc, args->a_cred));
2024                 } else if (args->a_io == NFS_COMMIT) {
2025                         (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2026                                         args->a_nfs_offset, args->a_nfs_count,
2027                                         args->a_cred);
2028                 } else if (args->a_io == NFS_INACTIVE) {
2029                         (*args->a_nfs_inactive)(args->a_vp, args->a_cred);
2030                 }
2031 
2032                 /*
2033                  * Now, release the vnode and free the credentials
2034                  * structure.
2035                  */
2036                 free_async_args(args);
2037                 /*
2038                  * Reacquire the mutex because it will be needed above.
2039                  */
2040                 mutex_enter(&mi->mi_async_lock);
2041         }
2042 }
2043 
2044 void
2045 nfs_async_stop(struct vfs *vfsp)
2046 {
2047         mntinfo_t *mi = VFTOMI(vfsp);
2048 
2049         /*
2050          * Wait for all outstanding async operations to complete and for the
2051          * worker threads to exit.
2052          */
2053         mutex_enter(&mi->mi_async_lock);
2054         mi->mi_max_threads = 0;
2055         cv_broadcast(&mi->mi_async_work_cv);
2056         while (mi->mi_threads != 0)
2057                 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2058         mutex_exit(&mi->mi_async_lock);
2059 }
2060 
2061 /*
2062  * nfs_async_stop_sig:
2063  * Wait for all outstanding putpage operations to complete.  If a signal
2064  * is delivered we will abort and return non-zero.  If we can put all the
2065  * pages we will return 0.  This routine is called from nfs_unmount and
2066  * nfs3_unmount to make these operations interruptible.
2067  */
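/*
 * A sketch of the unmount-path usage described above (error handling
 * beyond this is the caller's):
 *
 *	if (nfs_async_stop_sig(vfsp))
 *		return (EINTR);
 */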
2068 int
2069 nfs_async_stop_sig(struct vfs *vfsp)
2070 {
2071         mntinfo_t *mi = VFTOMI(vfsp);
2072         ushort_t omax;
2073         int rval;
2074 
2075         /*
2076          * Wait for all outstanding async operations to complete and for the
2077          * worker threads to exit.
2078          */
2079         mutex_enter(&mi->mi_async_lock);
2080         omax = mi->mi_max_threads;
2081         mi->mi_max_threads = 0;
2082         /*
2083          * Tell all the worker threads to exit.
2084          */
2085         cv_broadcast(&mi->mi_async_work_cv);
2086         while (mi->mi_threads != 0) {
2087                 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2088                         break;
2089         }
2090         rval = (mi->mi_threads != 0);        /* Interrupted */
2091         if (rval)
2092                 mi->mi_max_threads = omax;
2093         mutex_exit(&mi->mi_async_lock);
2094 
2095         return (rval);
2096 }
2097 
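/*
 * Copy tcount bytes from uio into the segmap window at base, in at
 * most PAGESIZE chunks, creating pages outright when the write covers
 * them entirely (see pagecreate below).  pgcreated is nonzero when
 * the caller has already created and mapped the first page at base.
 * The caller must hold r_rwlock as writer.
 */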
2098 int
2099 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2100 {
2101         int pagecreate;
2102         int n;
2103         int saved_n;
2104         caddr_t saved_base;
2105         u_offset_t offset;
2106         int error;
2107         int sm_error;
2108 
2109         ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2110         ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2111         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2112 
2113         /*
2114          * Move bytes in at most PAGESIZE chunks. We must avoid
2115          * spanning pages in uiomove() because page faults may cause
2116          * the cache to be invalidated out from under us. The r_size is not
2117          * updated until after the uiomove. If we push the last page of a
2118          * file before r_size is correct, we will lose the data written past
2119          * the current (and invalid) r_size.
2120          */
2121         do {
2122                 offset = uio->uio_loffset;
2123                 pagecreate = 0;
2124 
2125                 /*
2126                  * n is the number of bytes required to satisfy the request
2127                  *   or the number of bytes to fill out the page.
2128                  */
2129                 n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)),
2130                     tcount);
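                /*
                 * e.g. (illustrative, with 4K pages): base 0xc00 bytes
                 * into a page and tcount 0x2000 gives
                 * n = MIN(0x1000 - 0xc00, 0x2000) = 0x400, just enough
                 * to fill out the current page.
                 */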
2131 
2132                 /*
2133                  * Check to see if we can skip reading in the page
2134                  * and just allocate the memory.  We can do this
2135                  * if we are going to rewrite the entire mapping
2136                  * or if we are going to write to or beyond the current
2137                  * end of file from the beginning of the mapping.
2138                  *
2139                  * The read of r_size is now protected by r_statelock.
2140                  */
2141                 mutex_enter(&rp->r_statelock);
2142                 /*
2143                  * When pgcreated is nonzero the caller has already done
2144                  * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2145                  * segkpm this means we already have at least one page
2146                  * created and mapped at base.
2147                  */
2148                 pagecreate = pgcreated ||
2149                         (((uintptr_t)base & PAGEOFFSET) == 0 &&
2150                         (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2151 
2152                 mutex_exit(&rp->r_statelock);
2153                 if (pagecreate) {
2154                         /*
2155                          * The last argument tells segmap_pagecreate() to
2156                          * always lock the page, as opposed to sometimes
2157                          * returning with the page locked. This way we avoid a
2158                          * fault on the ensuing uiomove(), but also
2159                          * more importantly (to fix bug 1094402) we can
2160                          * call segmap_fault() to unlock the page in all
2161                          * cases. An alternative would be to modify
2162                          * segmap_pagecreate() to tell us when it is
2163                          * locking a page, but that's a fairly major
2164                          * interface change.
2165                          */
2166                         if (pgcreated == 0)
2167                                 (void) segmap_pagecreate(segkmap, base,
2168                                                         (uint_t)n, 1);
2169                         saved_base = base;
2170                         saved_n = n;
2171                 }
2172 
2173                 /*
2174                  * The number of bytes of data in the last page cannot be
2175                  * accurately determined while the page is being uiomove'd
2176                  * to and the size of the file is being updated.  Thus,
2177                  * inform threads which need to know accurately how much
2178                  * data is in the last page of the file.  They will not
2179                  * do the i/o immediately, but will arrange for the i/o
2180                  * to happen later when this modify operation has
2181                  * finished.
2182                  */
2183                 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2184                 mutex_enter(&rp->r_statelock);
2185                 rp->r_flags |= RMODINPROGRESS;
2186                 rp->r_modaddr = (offset & MAXBMASK);
2187                 mutex_exit(&rp->r_statelock);
2188 
2189                 error = uiomove(base, n, UIO_WRITE, uio);
2190 
2191                 /*
2192                  * r_size is the maximum number of
2193                  * bytes known to be in the file.
2194                  * Make sure it is at least as high as the
2195                  * first unwritten byte pointed to by uio_loffset.
2196                  */
2197                 mutex_enter(&rp->r_statelock);
2198                 if (rp->r_size < uio->uio_loffset)
2199                         rp->r_size = uio->uio_loffset;
2200                 rp->r_flags &= ~RMODINPROGRESS;
2201                 rp->r_flags |= RDIRTY;
2202                 mutex_exit(&rp->r_statelock);
2203 
2204                 /* n = # of bytes written */
2205                 n = (int)(uio->uio_loffset - offset);
2206                 base += n;
2207                 tcount -= n;
2208                 /*
2209                  * If we created pages w/o initializing them completely,
2210                  * we need to zero the part that wasn't set up.
2211                  * This happens in most EOF write cases and if
2212                  * we had some sort of error during the uiomove.
2213                  */
2214                 if (pagecreate) {
2215                         if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2216                                 (void) kzero(base, PAGESIZE - n);
2217 
2218                         if (pgcreated) {
2219                                 /*
2220                                  * Caller is responsible for this page,
2221                                  * it was not created in this loop.
2222                                  */
2223                                 pgcreated = 0;
2224                         } else {
2225                                 /*
2226                                  * For bug 1094402: segmap_pagecreate locks
2227                                  * page. Unlock it. This also unlocks the
2228                                  * pages allocated by page_create_va() in
2229                                  * segmap_pagecreate().
2230                                  */
2231                                 sm_error = segmap_fault(kas.a_hat, segkmap,
2232                                                 saved_base, saved_n,
2233                                                 F_SOFTUNLOCK, S_WRITE);
2234                                 if (error == 0)
2235                                         error = sm_error;
2236                         }
2237                 }
2238         } while (tcount > 0 && error == 0);
2239 
2240         return (error);
2241 }
2242 
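/*
 * Flush dirty pages of vp in the range [off, off + len); len == 0
 * means the entire file.  When the file system is out of space or
 * being unmounted, the pages are invalidated rather than written.
 */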
2243 int
2244 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2245 {
2246         rnode_t *rp;
2247         page_t *pp;
2248         u_offset_t eoff;
2249         u_offset_t io_off;
2250         size_t io_len;
2251         int error;
2252         int rdirty;
2253         int err;
2254 
2255         rp = VTOR(vp);
2256         ASSERT(rp->r_count > 0);
2257 
2258         if (!vn_has_cached_data(vp))
2259                 return (0);
2260 
2261         ASSERT(vp->v_type != VCHR);
2262 
2263         /*
2264          * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2265          * writes.  B_FORCE is set to force the VM system to actually
2266          * invalidate the pages, even if the i/o failed.  The pages
2267          * need to get invalidated because they can't be written out
2268          * because there isn't any space left on either the server's
2269          * file system or in the user's disk quota.  The B_FREE bit
2270          * is cleared to avoid confusion as to whether this is a
2271          * request to place the page on the freelist or to destroy
2272          * it.
2273          */
2274         if ((rp->r_flags & ROUTOFSPACE) ||
2275             (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2276                 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2277 
2278         if (len == 0) {
2279                 /*
2280                  * If doing a full file synchronous operation, then clear
2281                  * the RDIRTY bit.  If a page gets dirtied while the flush
2282                  * is happening, then RDIRTY will get set again.  The
2283                  * RDIRTY bit must get cleared before the flush so that
2284                  * we don't lose this information.
2285                  */
2286                 if (off == (u_offset_t)0 &&
2287                     !(flags & B_ASYNC) &&
2288                     (rp->r_flags & RDIRTY)) {
2289                         mutex_enter(&rp->r_statelock);
2290                         rdirty = (rp->r_flags & RDIRTY);
2291                         rp->r_flags &= ~RDIRTY;
2292                         mutex_exit(&rp->r_statelock);
2293                 } else
2294                         rdirty = 0;
2295 
2296                 /*
2297                  * Search the entire vp list for pages >= off, and flush
2298                  * the dirty pages.
2299                  */
2300                 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2301                                         flags, cr);
2302 
2303                 /*
2304          * If an error occurred and the file was marked as dirty
2305                  * before and we aren't forcibly invalidating pages, then
2306                  * reset the RDIRTY flag.
2307                  */
2308                 if (error && rdirty &&
2309                     (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2310                         mutex_enter(&rp->r_statelock);
2311                         rp->r_flags |= RDIRTY;
2312                         mutex_exit(&rp->r_statelock);
2313                 }
2314         } else {
2315                 /*
2316                  * Do a range from [off...off + len) looking for pages
2317                  * to deal with.
2318                  */
2319                 error = 0;
2320 #ifdef lint
2321                 io_len = 0;
2322 #endif
2323                 eoff = off + len;
2324                 mutex_enter(&rp->r_statelock);
2325                 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2326                     io_off += io_len) {
2327                         mutex_exit(&rp->r_statelock);
2328                         /*
2329                          * If we are not invalidating, synchronously
2330                          * freeing, or writing pages, use the routine
2331                          * page_lookup_nowait() to prevent reclaiming
2332                          * them from the free list.
2333                          */
2334                         if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2335                                 pp = page_lookup(vp, io_off,
2336                                     (flags & (B_INVAL | B_FREE)) ?
2337                                     SE_EXCL : SE_SHARED);
2338                         } else {
2339                                 pp = page_lookup_nowait(vp, io_off,
2340                                     (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2341                         }
2342 
2343                         if (pp == NULL || !pvn_getdirty(pp, flags))
2344                                 io_len = PAGESIZE;
2345                         else {
2346                                 err = (*rp->r_putapage)(vp, pp, &io_off,
2347                                     &io_len, flags, cr);
2348                                 if (!error)
2349                                         error = err;
2350                                 /*
2351                                  * "io_off" and "io_len" are returned as
2352                                  * the range of pages we actually wrote.
2353                                  * This allows us to skip ahead more quickly
2354                          * since several pages may have been dealt
2355                                  * with by this iteration of the loop.
2356                                  */
2357                         }
2358                         mutex_enter(&rp->r_statelock);
2359                 }
2360                 mutex_exit(&rp->r_statelock);
2361         }
2362 
2363         return (error);
2364 }
2365 
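/*
 * Destroy the cached pages of vp at and beyond off, typically for a
 * truncation.  Concurrent truncators are serialized on the RTRUNCATE
 * flag in r_flags.
 */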
2366 void
2367 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2368 {
2369         rnode_t *rp;
2370 
2371         rp = VTOR(vp);
2372         mutex_enter(&rp->r_statelock);
2373         while (rp->r_flags & RTRUNCATE)
2374                 cv_wait(&rp->r_cv, &rp->r_statelock);
2375         rp->r_flags |= RTRUNCATE;
2376         if (off == (u_offset_t)0) {
2377                 rp->r_flags &= ~RDIRTY;
2378                 if (!(rp->r_flags & RSTALE))
2379                         rp->r_error = 0;
2380         }
2381         rp->r_truncaddr = off;
2382         mutex_exit(&rp->r_statelock);
2383         (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2384                 B_INVAL | B_TRUNC, cr);
2385         mutex_enter(&rp->r_statelock);
2386         rp->r_flags &= ~RTRUNCATE;
2387         cv_broadcast(&rp->r_cv);
2388         mutex_exit(&rp->r_statelock);
2389 }
2390 
2391 static int nfs_write_error_to_cons_only = 0;
2392 #define MSG(x)  (nfs_write_error_to_cons_only ? (x) : (x) + 1)
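/*
 * cmn_err() messages that begin with '^' go to the console only, so
 * MSG() keeps the '^' when nfs_write_error_to_cons_only is set and
 * otherwise skips past it, letting the message reach both the console
 * and the system log.
 */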
2393 
2394 /*
2395  * Print a file handle
2396  */
2397 void
2398 nfs_printfhandle(nfs_fhandle *fhp)
2399 {
2400         int *ip;
2401         char *buf;
2402         size_t bufsize;
2403         char *cp;
2404 
2405         /*
2406          * 13 == "(file handle:"
2407          * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2408          *      1 == ' '
2409          *      8 == maximum strlen of "%x"
2410          * 3 == ")\n\0"
2411          */
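        /*
         * e.g., assuming NFS_FHANDLE_LEN is 64 and 4-byte ints:
         * bufsize = 13 + (64 / 4) * 9 + 3 = 160 bytes.
         */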
2412         bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2413         buf = kmem_alloc(bufsize, KM_NOSLEEP);
2414         if (buf == NULL)
2415                 return;
2416 
2417         cp = buf;
2418         (void) strcpy(cp, "(file handle:");
2419         while (*cp != '\0')
2420                 cp++;
2421         for (ip = (int *)fhp->fh_buf;
2422             ip < (int *)&fhp->fh_buf[fhp->fh_len];
2423             ip++) {
2424                 (void) sprintf(cp, " %x", *ip);
2425                 while (*cp != '\0')
2426                         cp++;
2427         }
2428         (void) strcpy(cp, ")\n");
2429 
2430         zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2431 
2432         kmem_free(buf, bufsize);
2433 }
2434 
2435 /*
2436  * Notify the system administrator that an NFS write error has
2437  * occurred.
2438  */
2439 
2440 /* seconds between ENOSPC/EDQUOT messages */
2441 clock_t nfs_write_error_interval = 5;
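/*
 * e.g., with hz = 100 and the default interval, at most one
 * ENOSPC/EDQUOT message is printed per file system every 500 ticks
 * (5 seconds).
 */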
2442 
2443 void
2444 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2445 {
2446         mntinfo_t *mi;
2447 
2448         mi = VTOMI(vp);
2449         /*
2450          * In case of forced unmount or zone shutdown, do not print any
2451          * messages since it can flood the console with error messages.
2452          */
2453         if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2454                 return;
2455 
2456         /*
2457          * No use in flooding the console with ENOSPC
2458          * messages from the same file system.
2459          */
2460         if ((error != ENOSPC && error != EDQUOT) ||
2461             lbolt - mi->mi_printftime > 0) {
2462                 zoneid_t zoneid = mi->mi_zone->zone_id;
2463 
2464 #ifdef DEBUG
2465                 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2466                     mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2467 #else
2468                 nfs_perror(error, "NFS write error on host %s: %m.\n",
2469                     VTOR(vp)->r_server->sv_hostname, NULL);
2470 #endif
2471                 if (error == ENOSPC || error == EDQUOT) {
2472                         zcmn_err(zoneid, CE_CONT,
2473                             MSG("^File: userid=%d, groupid=%d\n"),
2474                             crgetuid(cr), crgetgid(cr));
2475                         if (crgetuid(CRED()) != crgetuid(cr) ||
2476                             crgetgid(CRED()) != crgetgid(cr)) {
2477                                 zcmn_err(zoneid, CE_CONT,
2478                                     MSG("^User: userid=%d, groupid=%d\n"),
2479                                     crgetuid(CRED()), crgetgid(CRED()));
2480                         }
2481                         mi->mi_printftime = lbolt +
2482                             nfs_write_error_interval * hz;
2483                 }
2484                 nfs_printfhandle(&VTOR(vp)->r_fh);
2485 #ifdef DEBUG
2486                 if (error == EACCES) {
2487                         zcmn_err(zoneid, CE_CONT,
2488                             MSG("^nfs_bio: cred is%s kcred\n"),
2489                             cr == kcred ? "" : " not");
2490                 }
2491 #endif
2492         }
2493 }
2494 
2495 /* ARGSUSED */
2496 static void *
2497 nfs_mi_init(zoneid_t zoneid)
2498 {
2499         struct mi_globals *mig;
2500 
2501         mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2502         mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2503         list_create(&mig->mig_list, sizeof (mntinfo_t),
2504             offsetof(mntinfo_t, mi_zone_node));
2505         mig->mig_destructor_called = B_FALSE;
2506         return (mig);
2507 }
2508 
2509 /*
2510  * Callback routine to tell all NFS mounts in the zone to stop creating new
2511  * threads.  Existing threads should exit.
2512  */
2513 /* ARGSUSED */
2514 static void
2515 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2516 {
2517         struct mi_globals *mig = data;
2518         mntinfo_t *mi;
2519 
2520         ASSERT(mig != NULL);
2521 again:
2522         mutex_enter(&mig->mig_lock);
2523         for (mi = list_head(&mig->mig_list); mi != NULL;
2524             mi = list_next(&mig->mig_list, mi)) {
2525 
2526                 /*
2527                  * If we've done the shutdown work for this FS, skip.
2528                  * Once we go off the end of the list, we're done.
2529                  */
2530                 if (mi->mi_flags & MI_DEAD)
2531                         continue;
2532 
2533                 /*
2534                  * We will do work, so not done.  Get a hold on the FS.
2535                  */
2536                 VFS_HOLD(mi->mi_vfsp);
2537 
2538                 /*
2539                  * purge the DNLC for this filesystem
2540                  */
2541                 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2542 
2543                 mutex_enter(&mi->mi_async_lock);
2544                 /*
2545                  * Tell existing async worker threads to exit.
2546                  */
2547                 mi->mi_max_threads = 0;
2548                 cv_broadcast(&mi->mi_async_work_cv);
2549                 /*
2550                  * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2551                  * getting ready to exit when it's done with its current work.
2552                  * Also set MI_DEAD to note we've acted on this FS.
2553                  */
2554                 mutex_enter(&mi->mi_lock);
2555                 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2556                 mutex_exit(&mi->mi_lock);
2557                 /*
2558                  * Wake up the async manager thread.
2559                  */
2560                 cv_broadcast(&mi->mi_async_reqs_cv);
2561                 mutex_exit(&mi->mi_async_lock);
2562 
2563                 /*
2564                  * Drop lock and release FS, which may change list, then repeat.
2565                  * We're done when every mi has been done or the list is empty.
2566                  */
2567                 mutex_exit(&mig->mig_lock);
2568                 VFS_RELE(mi->mi_vfsp);
2569                 goto again;
2570         }
2571         mutex_exit(&mig->mig_lock);
2572 }
2573 
2574 static void
2575 nfs_mi_free_globals(struct mi_globals *mig)
2576 {
2577         list_destroy(&mig->mig_list);    /* makes sure the list is empty */
2578         mutex_destroy(&mig->mig_lock);
2579         kmem_free(mig, sizeof (*mig));
2580 
2581 }
2582 
2583 /* ARGSUSED */
2584 static void
2585 nfs_mi_destroy(zoneid_t zoneid, void *data)
2586 {
2587         struct mi_globals *mig = data;
2588 
2589         ASSERT(mig != NULL);
2590         mutex_enter(&mig->mig_lock);
2591         if (list_head(&mig->mig_list) != NULL) {
2592                 /* Still waiting for VFS_FREEVFS() */
2593                 mig->mig_destructor_called = B_TRUE;
2594                 mutex_exit(&mig->mig_lock);
2595                 return;
2596         }
2597         nfs_mi_free_globals(mig);
2598 }
2599 
2600 /*
2601  * Add an NFS mount to the per-zone list of NFS mounts.
2602  */
2603 void
2604 nfs_mi_zonelist_add(mntinfo_t *mi)
2605 {
2606         struct mi_globals *mig;
2607 
2608         mig = zone_getspecific(mi_list_key, mi->mi_zone);
2609         mutex_enter(&mig->mig_lock);
2610         list_insert_head(&mig->mig_list, mi);
2611         mutex_exit(&mig->mig_lock);
2612 }
2613 
2614 /*
2615  * Remove an NFS mount from the per-zone list of NFS mounts.
2616  */
2617 static void
2618 nfs_mi_zonelist_remove(mntinfo_t *mi)
2619 {
2620         struct mi_globals *mig;
2621 
2622         mig = zone_getspecific(mi_list_key, mi->mi_zone);
2623         mutex_enter(&mig->mig_lock);
2624         list_remove(&mig->mig_list, mi);
2625         /*
2626          * We can be called asynchronously by VFS_FREEVFS() after the zone
2627          * shutdown/destroy callbacks have executed; if so, clean up the zone's
2628          * mi globals.
2629          */
2630         if (list_head(&mig->mig_list) == NULL &&
2631             mig->mig_destructor_called == B_TRUE) {
2632                 nfs_mi_free_globals(mig);
2633                 return;
2634         }
2635         mutex_exit(&mig->mig_lock);
2636 }
2637 
2638 /*
2639  * NFS Client initialization routine.  This routine should only be called
2640  * once.  It performs the following tasks:
2641  *      - Initialize all global locks
2642  *      - Call sub-initialization routines (localize access to variables)
2643  */
2644 int
2645 nfs_clntinit(void)
2646 {
2647 #ifdef DEBUG
2648         static boolean_t nfs_clntup = B_FALSE;
2649 #endif
2650         int error;
2651 
2652 #ifdef DEBUG
2653         ASSERT(nfs_clntup == B_FALSE);
2654 #endif
2655 
2656         error = nfs_subrinit();
2657         if (error)
2658                 return (error);
2659 
2660         error = nfs_vfsinit();
2661         if (error) {
2662                 /*
2663                  * Cleanup nfs_subrinit() work
2664                  */
2665                 nfs_subrfini();
2666                 return (error);
2667         }
2668         zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2669             nfs_mi_destroy);
2670 
2671         nfs4_clnt_init();
2672 
2673 #ifdef DEBUG
2674         nfs_clntup = B_TRUE;
2675 #endif
2676 
2677         return (0);
2678 }
2679 
2680 /*
2681  * This routine is only called if the NFS Client has been initialized but
2682  * the module failed to be installed.  This routine will clean up the previously
2683  * allocated/initialized work.
2684  */
2685 void
2686 nfs_clntfini(void)
2687 {
2688         (void) zone_key_delete(mi_list_key);
2689         nfs_subrfini();
2690         nfs_vfsfini();
2691         nfs4_clnt_fini();
2692 }
2693 
2694 /*
2695  * nfs_lockrelease:
2696  *
2697  * Release any locks on the given vnode that are held by the current
2698  * process.
2699  */
2700 void
2701 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2702 {
2703         flock64_t ld;
2704         struct shrlock shr;
2705         char *buf;
2706         int remote_lock_possible;
2707         int ret;
2708 
2709         ASSERT((uintptr_t)vp > KERNELBASE);
2710 
2711         /*
2712          * Generate an explicit unlock operation for the entire file.  As a
2713          * partial optimization, only generate the unlock if there is a
2714          * lock registered for the file.  We could check whether this
2715          * particular process has any locks on the file, but that would
2716          * require the local locking code to provide yet another query
2717          * routine.  Note that no explicit synchronization is needed here.
2718          * At worst, flk_has_remote_locks() will return a false positive,
2719          * in which case the unlock call wastes time but doesn't harm
2720          * correctness.
2721          *
2722          * In addition, an unlock request is generated if the process
2723          * is listed as possibly having a lock on the file because the
2724          * server and client lock managers may have gotten out of sync.
2725          * N.B. It is important to make sure nfs_remove_locking_id() is
2726          * called here even if flk_has_remote_locks(vp) reports true.
2727          * If it is not called and there is an entry on the process id
2728          * list, that entry will never get removed.
2729          */
2730         remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2731             (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2732         if (remote_lock_possible || flk_has_remote_locks(vp)) {
2733                 ld.l_type = F_UNLCK;    /* set to unlock entire file */
2734                 ld.l_whence = 0;        /* unlock from start of file */
2735                 ld.l_start = 0;
2736                 ld.l_len = 0;           /* do entire file */
2737                 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr);
2738 
2739                 if (ret != 0) {
2740                         /*
2741                          * If VOP_FRLOCK fails, make sure we unregister
2742                          * local locks before we continue.
2743                          */
2744                         ld.l_pid = ttoproc(curthread)->p_pid;
2745                         lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2746 #ifdef DEBUG
2747                         nfs_perror(ret,
2748                             "NFS lock release error on vp %p: %m.\n",
2749                             (void *)vp, NULL);
2750 #endif
2751                 }
2752 
2753                 /*
2754                  * The call to VOP_FRLOCK may put the pid back on the
2755                  * list.  We need to remove it.
2756                  */
2757                 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2758                     (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2759         }
2760 
2761         /*
2762          * As long as the vp has a share matching our pid,
2763          * pluck it off and unshare it.  There are circumstances in
2764          * which the call to nfs_remove_locking_id() may put the
2765          * owner back on the list, in which case we simply do a
2766          * redundant and harmless unshare.
2767          */
2768         buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2769         while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2770             (char *)NULL, buf, &shr.s_own_len)) {
2771                 shr.s_owner = buf;
2772                 shr.s_access = 0;
2773                 shr.s_deny = 0;
2774                 shr.s_sysid = 0;
2775                 shr.s_pid = curproc->p_pid;
2776 
2777                 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr);
2778 #ifdef DEBUG
2779                 if (ret != 0) {
2780                         nfs_perror(ret,
2781                             "NFS share release error on vp %p: %m.\n",
2782                             (void *)vp, NULL);
2783                 }
2784 #endif
2785         }
2786         kmem_free(buf, MAX_SHR_OWNER_LEN);
2787 }
2788 
2789 /*
2790  * nfs_lockcompletion:
2791  *
2792  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2793  * as non-cacheable (set the VNOCACHE bit).
2794  */
2795 
2796 void
2797 nfs_lockcompletion(vnode_t *vp, int cmd)
2798 {
2799 #ifdef DEBUG
2800         rnode_t *rp = VTOR(vp);
2801 
2802         ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2803 #endif
2804 
2805         if (cmd == F_SETLK || cmd == F_SETLKW) {
2806                 if (!lm_safemap(vp)) {
2807                         mutex_enter(&vp->v_lock);
2808                         vp->v_flag |= VNOCACHE;
2809                         mutex_exit(&vp->v_lock);
2810                 } else {
2811                         mutex_enter(&vp->v_lock);
2812                         vp->v_flag &= ~VNOCACHE;
2813                         mutex_exit(&vp->v_lock);
2814                 }
2815         }
2816         /*
2817          * The cached attributes of the file are stale after acquiring
2818          * the lock on the file. They were updated when the file was
2819          * opened, but not updated when the lock was acquired. Therefore the
2820          * cached attributes are invalidated after the lock is obtained.
2821          */
2822         PURGE_ATTRCACHE(vp);
2823 }
2824 
2825 /*
2826  * The lock manager holds state making it possible for the client
2827  * and server to be out of sync.  For example, if the response from
2828  * the server granting a lock request is lost, the server will think
2829  * the lock is granted and the client will think the lock is lost.
2830  * The client can tell when it is not positive whether it is in sync with
2831  * the server.
2832  *
2833  * To deal with this, a list of processes for which the client is
2834  * not sure if the server holds a lock is attached to the rnode.
2835  * When such a process closes the rnode, an unlock request is sent
2836  * to the server to unlock the entire file.
2837  *
2838  * The list is kept as a singly linked, NULL-terminated list.
2839  * Because it is only added to under extreme error conditions, the
2840  * list shouldn't get very big.  DEBUG kernels print a message if
2841  * the list gets bigger than nfs_lmpl_high_water, an arbitrary
2842  * limit (128 in this file) that can be tuned at runtime.
2843  */
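/*
 * A sketch of an r_lmpl list holding one pid entry and one owner
 * entry (fields: lmpl_type, lmpl_pid, lmpl_owner, lmpl_own_len):
 *
 *	rp->r_lmpl -> [RLMPL_PID, pid, &pid, sizeof (pid_t)]
 *	           -> [RLMPL_OWNER, pid, owner, own_len] -> NULL
 */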
2844 #ifdef DEBUG
2845 /* int nfs_lmpl_high_water = 8; */
2846 int nfs_lmpl_high_water = 128;
2847 int nfs_cnt_add_locking_id = 0;
2848 int nfs_len_add_locking_id = 0;
2849 #endif /* DEBUG */
2850 
2851 /*
2852  * Record that the nfs lock manager server may be holding a lock on
2853  * a vnode for a process.
2854  *
2855  * Because the nfs lock manager server holds state, it is possible
2856  * for the server to get out of sync with the client.  This routine is called
2857  * from the client when it is no longer sure if the server is in sync
2858  * with the client.  nfs_lockrelease() will then notice this and send
2859  * an unlock request when the file is closed.
2860  */
2861 void
2862 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
2863 {
2864         rnode_t *rp;
2865         lmpl_t *new;
2866         lmpl_t *cur;
2867         lmpl_t **lmplp;
2868 #ifdef DEBUG
2869         int list_len = 1;
2870 #endif /* DEBUG */
2871 
2872 #ifdef DEBUG
2873         ++nfs_cnt_add_locking_id;
2874 #endif /* DEBUG */
2875         /*
2876          * allocate new lmpl_t now so we don't sleep
2877          * later after grabbing mutexes
2878          */
2879         ASSERT(len < MAX_SHR_OWNER_LEN);
2880         new = kmem_alloc(sizeof (*new), KM_SLEEP);
2881         new->lmpl_type = type;
2882         new->lmpl_pid = pid;
2883         new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
2884         bcopy(id, new->lmpl_owner, len);
2885         new->lmpl_own_len = len;
2886         new->lmpl_next = (lmpl_t *)NULL;
2887 #ifdef DEBUG
2888         if (type == RLMPL_PID) {
2889                 ASSERT(len == sizeof (pid_t));
2890                 ASSERT(pid == *(pid_t *)new->lmpl_owner);
2891         } else {
2892                 ASSERT(type == RLMPL_OWNER);
2893         }
2894 #endif
2895 
2896         rp = VTOR(vp);
2897         mutex_enter(&rp->r_statelock);
2898 
2899         /*
2900          * Add this id to the list for this rnode only if the
2901          * rnode is active and the id is not already there.
2902          */
2903         ASSERT(rp->r_flags & RHASHED);
2904         lmplp = &(rp->r_lmpl);
2905         for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
2906                 if (cur->lmpl_pid == pid &&
2907                     cur->lmpl_type == type &&
2908                     cur->lmpl_own_len == len &&
2909                     bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
2910                         kmem_free(new->lmpl_owner, len);
2911                         kmem_free(new, sizeof (*new));
2912                         break;
2913                 }
2914                 lmplp = &cur->lmpl_next;
2915 #ifdef DEBUG
2916                 ++list_len;
2917 #endif /* DEBUG */
2918         }
2919         if (cur == (lmpl_t *)NULL) {
2920                 *lmplp = new;
2921 #ifdef DEBUG
2922                 if (list_len > nfs_len_add_locking_id) {
2923                         nfs_len_add_locking_id = list_len;
2924                 }
2925                 if (list_len > nfs_lmpl_high_water) {
2926                         cmn_err(CE_WARN, "nfs_add_locking_id: long list "
2927                             "vp=%p is %d", (void *)vp, list_len);
2928                 }
2929 #endif /* DEBUG */
2930         }
2931 
2932 #ifdef DEBUG
2933         if (share_debug) {
2934                 int nitems = 0;
2935                 int npids = 0;
2936                 int nowners = 0;
2937 
2938                 /*
2939                  * Count the number of things on r_lmpl after the add.
2940                  */
                for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
                    cur = cur->lmpl_next) {
                        nitems++;
                        if (cur->lmpl_type == RLMPL_PID) {
                                npids++;
                        } else if (cur->lmpl_type == RLMPL_OWNER) {
                                nowners++;
                        } else {
                                cmn_err(CE_PANIC, "nfs_add_locking_id: "
                                    "unrecognised lmpl_type %d",
                                    cur->lmpl_type);
                        }
                }

                cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
                    "OWNs = %d items left on r_lmpl\n",
                    (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
        }
#endif

        mutex_exit(&rp->r_statelock);
}

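/*
 * Illustrative usage sketch (not from this file): a caller that has
 * sent a lock request but cannot be sure the server saw it might
 * record the owning pid, so that nfs_lockrelease() issues a
 * precautionary unlock when the file is closed.  For RLMPL_PID
 * entries the owner bytes are the pid itself (see the DEBUG asserts
 * in nfs_add_locking_id() above):
 *
 *	pid_t pid = curproc->p_pid;
 *
 *	nfs_add_locking_id(vp, pid, RLMPL_PID, (char *)&pid,
 *	    sizeof (pid));
 */
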
/*
 * Remove an id from the lock manager id list.
 *
 * If the id is not in the list, return 0.  If it was found and
 * removed, return 1.
 */
static int
nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
{
        lmpl_t *cur;
        lmpl_t **lmplp;
        rnode_t *rp;
        int rv = 0;

        ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);

        rp = VTOR(vp);

        mutex_enter(&rp->r_statelock);
        ASSERT(rp->r_flags & RHASHED);
        lmplp = &(rp->r_lmpl);

        /*
         * Search through the list and remove the entry for this id
         * if it is there.  The special case id == NULL allows removal
         * of the first share on the r_lmpl list belonging to the
         * current process (if any), without regard to further details
         * of its identity.
         */
        for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
                if (cur->lmpl_type == type &&
                    cur->lmpl_pid == curproc->p_pid &&
                    (id == (char *)NULL ||
                    bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
                        *lmplp = cur->lmpl_next;
                        ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
                        if (rid != NULL) {
                                bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
                                *rlen = cur->lmpl_own_len;
                        }
                        kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
                        kmem_free(cur, sizeof (*cur));
                        rv = 1;
                        break;
                }
                lmplp = &cur->lmpl_next;
        }

#ifdef DEBUG
        if (share_debug) {
                int nitems = 0;
                int npids = 0;
                int nowners = 0;

                /*
                 * Count the number of things left on r_lmpl after the remove.
                 */
                for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
                    cur = cur->lmpl_next) {
                        nitems++;
                        if (cur->lmpl_type == RLMPL_PID) {
                                npids++;
                        } else if (cur->lmpl_type == RLMPL_OWNER) {
                                nowners++;
                        } else {
                                cmn_err(CE_PANIC, "nfs_remove_locking_id: "
                                    "unrecognised lmpl_type %d",
                                    cur->lmpl_type);
                        }
                }

                cmn_err(CE_CONT,
                    "nfs_remove_locking_id(%s): %d PIDs + %d OWNs = "
                    "%d items left on r_lmpl\n",
                    (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
        }
#endif

        mutex_exit(&rp->r_statelock);
        return (rv);
}

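/*
 * Illustrative usage sketch (not from this file): retrieve and drop
 * the owner recorded for the current process, then use it to release
 * the corresponding server-side state.  The rid buffer must be at
 * least MAX_SHR_OWNER_LEN bytes, per the ASSERT above:
 *
 *	char owner[MAX_SHR_OWNER_LEN];
 *	int olen;
 *
 *	if (nfs_remove_locking_id(vp, RLMPL_OWNER, NULL, owner, &olen))
 *		... send the unlock or unshare using (owner, olen) ...
 */
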
void
nfs_free_mi(mntinfo_t *mi)
{
        ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
        ASSERT(mi->mi_manager_thread == NULL);
        ASSERT(mi->mi_threads == 0);

        /*
         * Remove the node from the global list before we start tearing
         * it down.
         */
        nfs_mi_zonelist_remove(mi);
        if (mi->mi_klmconfig) {
                lm_free_config(mi->mi_klmconfig);
                kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
        }
        mutex_destroy(&mi->mi_lock);
        mutex_destroy(&mi->mi_remap_lock);
        mutex_destroy(&mi->mi_async_lock);
        cv_destroy(&mi->mi_failover_cv);
        cv_destroy(&mi->mi_async_work_cv);
        cv_destroy(&mi->mi_async_reqs_cv);
        cv_destroy(&mi->mi_async_cv);
        zone_rele(mi->mi_zone);
        kmem_free(mi, sizeof (*mi));
}

static int
mnt_kstat_update(kstat_t *ksp, int rw)
{
        mntinfo_t *mi;
        struct mntinfo_kstat *mik;
        vfs_t *vfsp;
        int i;

        /* This is a read-only kstat; bail out on a write. */
        if (rw == KSTAT_WRITE)
                return (EACCES);

        /*
         * We don't want to wait here as kstat_chain_lock could be held by
         * dounmount(). dounmount() takes vfs_reflock before the chain lock
         * and thus could lead to a deadlock.
         */
        vfsp = (struct vfs *)ksp->ks_private;

        mi = VFTOMI(vfsp);

        mik = (struct mntinfo_kstat *)ksp->ks_data;

        (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
        mik->mik_vers = (uint32_t)mi->mi_vers;
        mik->mik_flags = mi->mi_flags;
        mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
        mik->mik_curread = (uint32_t)mi->mi_curread;
        mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
        mik->mik_retrans = mi->mi_retrans;
        mik->mik_timeo = mi->mi_timeo;
        mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
        mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
        mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
        mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
        for (i = 0; i < NFS_CALLTYPES + 1; i++) {
                mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
                mik->mik_timers[i].deviate =
                    (uint32_t)mi->mi_timers[i].rt_deviate;
                mik->mik_timers[i].rtxcur =
                    (uint32_t)mi->mi_timers[i].rt_rtxcur;
        }
        mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
        mik->mik_failover = (uint32_t)mi->mi_failover;
        mik->mik_remap = (uint32_t)mi->mi_remap;
        (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);

        return (0);
}

void
nfs_mnt_kstat_init(struct vfs *vfsp)
{
        mntinfo_t *mi = VFTOMI(vfsp);

        /*
         * Create the version specific kstats.
         *
         * PSARC 2001/697 Contract Private Interface
         * All nfs kstats are under SunMC contract
         * Please refer to the PSARC listed above and contact
         * SunMC before making any changes!
         *
         * Changes must be reviewed by Solaris File Sharing
         * Changes must be communicated to contract-2001-697@sun.com
         *
         */

        mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
            NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
        if (mi->mi_io_kstats) {
                if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
                        kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
                mi->mi_io_kstats->ks_lock = &mi->mi_lock;
                kstat_install(mi->mi_io_kstats);
        }

        if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
            getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
            sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
                if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
                        kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
                mi->mi_ro_kstats->ks_update = mnt_kstat_update;
                mi->mi_ro_kstats->ks_private = (void *)vfsp;
                kstat_install(mi->mi_ro_kstats);
        }
}

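/*
 * Consumer sketch (illustrative, not from this file): the raw
 * "mntinfo" kstat installed above can be read from userland with
 * libkstat(3KSTAT).  Its instance number is the minor of vfs_dev, so
 * a reader that doesn't know the instance can walk the kstat chain:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp;
 *	struct mntinfo_kstat mik;
 *
 *	for (ksp = kc->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
 *		if (strcmp(ksp->ks_module, "nfs") != 0 ||
 *		    strcmp(ksp->ks_name, "mntinfo") != 0)
 *			continue;
 *		if (kstat_read(kc, ksp, &mik) != -1)
 *			(void) printf("%s vers %u via %s\n",
 *			    mik.mik_curserver, mik.mik_vers, mik.mik_proto);
 *	}
 *	(void) kstat_close(kc);
 */
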
nfs_delmapcall_t *
nfs_init_delmapcall(void)
{
        nfs_delmapcall_t        *delmap_call;

        delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
        delmap_call->call_id = curthread;
        delmap_call->error = 0;

        return (delmap_call);
}

void
nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
{
        kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
}

/*
 * Search for the current delmap caller (based on curthread) in the list
 * of callers.  If it is found, remove it from the list and free it.
 * Returns:
 *      0 if the caller wasn't found
 *      1 if the caller was found, removed, and freed.  *errp is set to
 *      the result of the delmap.
 */
int
nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
{
        nfs_delmapcall_t        *delmap_call;

        /*
         * If the list doesn't exist yet, we create it and return
         * that the caller wasn't found.  No list = no callers.
         */
        mutex_enter(&rp->r_statelock);
        if (!(rp->r_flags & RDELMAPLIST)) {
                /* The list does not exist */
                list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
                    offsetof(nfs_delmapcall_t, call_node));
                rp->r_flags |= RDELMAPLIST;
                mutex_exit(&rp->r_statelock);
                return (0);
        } else {
                /* The list exists so search it */
                for (delmap_call = list_head(&rp->r_indelmap);
                    delmap_call != NULL;
                    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
                        if (delmap_call->call_id == curthread) {
                                /* current caller is in the list */
                                *errp = delmap_call->error;
                                list_remove(&rp->r_indelmap, delmap_call);
                                mutex_exit(&rp->r_statelock);
                                nfs_free_delmapcall(delmap_call);
                                return (1);
                        }
                }
        }
        mutex_exit(&rp->r_statelock);
        return (0);
}
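
/*
 * Protocol sketch (illustrative, not from this file): a delmap entry
 * point is expected to call nfs_find_and_delete_delmapcall() first to
 * detect re-entry by the same thread, and otherwise register itself
 * with nfs_init_delmapcall() before starting the real work, roughly:
 *
 *	int error;
 *	nfs_delmapcall_t *delmap_call;
 *
 *	if (nfs_find_and_delete_delmapcall(rp, &error))
 *		return (error);
 *
 *	delmap_call = nfs_init_delmapcall();
 *	mutex_enter(&rp->r_statelock);
 *	list_insert_tail(&rp->r_indelmap, delmap_call);
 *	mutex_exit(&rp->r_statelock);
 *	... do the unmap work, recording its result in
 *	delmap_call->error for the repeated call to pick up ...
 */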