New nfs_client.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27 * All rights reserved.
28 */
29
30 /* Copyright (c) 2006, The Ohio State University. All rights reserved.
31 *
32 * Portions of this source code were developed by the team members of
33 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
34 * headed by Professor Dhabaleswar K. (DK) Panda.
35 *
36 * Acknowledgements for contributions from developers:
37 * Ranjit Noronha: noronha@cse.ohio-state.edu
38 * Lei Chai : chail@cse.ohio-state.edu
39 * Weikuan Yu : yuw@cse.ohio-state.edu
40 *
41 */
42
43 #pragma ident "@(#)nfs_client.c 1.193 05/10/11 SMI"
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/systm.h>
48 #include <sys/thread.h>
49 #include <sys/t_lock.h>
50 #include <sys/time.h>
51 #include <sys/vnode.h>
52 #include <sys/vfs.h>
53 #include <sys/errno.h>
54 #include <sys/buf.h>
55 #include <sys/stat.h>
56 #include <sys/cred.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/dnlc.h>
60 #include <sys/vmsystm.h>
61 #include <sys/flock.h>
62 #include <sys/share.h>
63 #include <sys/cmn_err.h>
64 #include <sys/tiuser.h>
65 #include <sys/sysmacros.h>
66 #include <sys/callb.h>
67 #include <sys/acl.h>
68 #include <sys/kstat.h>
69 #include <sys/signal.h>
70 #include <sys/list.h>
71 #include <sys/zone.h>
72
73 #include <rpc/types.h>
74 #include <rpc/xdr.h>
75 #include <rpc/auth.h>
76 #include <rpc/clnt.h>
77
78 #include <nfs/nfs.h>
79 #include <nfs/nfs_clnt.h>
80
81 #include <nfs/rnode.h>
82 #include <nfs/nfs_acl.h>
83 #include <nfs/lm.h>
84
85 #include <vm/hat.h>
86 #include <vm/as.h>
87 #include <vm/page.h>
88 #include <vm/pvn.h>
89 #include <vm/seg.h>
90 #include <vm/seg_map.h>
91 #include <vm/seg_vn.h>
92
93 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
94 cred_t *);
95 static int nfs_getattr_cache(vnode_t *, struct vattr *);
96 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
97
98 struct mi_globals {
99 kmutex_t mig_lock; /* lock protecting mig_list */
100 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */
101 boolean_t mig_destructor_called;
102 };
103
104 static zone_key_t mi_list_key;
105
106 /* Debugging flag for PC file shares. */
107 extern int share_debug;
108
109 /*
110 * Used by the RDMA transport to easily recognize READ3 call/reply.
111 * (FTDO -- for the demo only. A better design is needed for the NFS4 or ON10 putback.)
112 */
113
114 extern xdrproc_t x_READ3args;
115 extern xdrproc_t x_READ3res;
116 extern xdrproc_t x_READ3uiores;
117 extern xdrproc_t x_READ3vres;
118
119 /*
120 * Attributes caching:
121 *
122 * Attributes are cached in the rnode in struct vattr form.
123 * There is a time associated with the cached attributes (r_attrtime)
124 * which tells whether the attributes are valid. The time is initialized
125 * to the difference between current time and the modify time of the vnode
126 * when new attributes are cached. This allows the attributes for
127 * files that have changed recently to be timed out sooner than for files
128 * that have not changed for a long time. There are minimum and maximum
129 * timeout values that can be set per mount point.
130 */
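/*
 * For illustration, assuming the common mount defaults of acregmin=3s
 * and acregmax=60s: a regular file whose last change was detected by
 * this client 10 seconds ago has its freshly cached attributes trusted
 * for roughly 10 seconds, a file that changed 1 second ago is held to
 * the 3 second minimum, and a file that has been idle for hours is
 * capped at the 60 second maximum. Directories use acdirmin/acdirmax
 * instead. See nfs_attrcache_va() for the actual computation.
 */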
131
132 int
133 nfs_waitfor_purge_complete(vnode_t *vp)
134 {
135 rnode_t *rp;
136 k_sigset_t smask;
137
138 rp = VTOR(vp);
139 if (rp->r_serial != NULL && rp->r_serial != curthread) {
140 mutex_enter(&rp->r_statelock);
141 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
142 while (rp->r_serial != NULL) {
143 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
144 sigunintr(&smask);
145 mutex_exit(&rp->r_statelock);
146 return (EINTR);
147 }
148 }
149 sigunintr(&smask);
150 mutex_exit(&rp->r_statelock);
151 }
152 return (0);
153 }
154
155 /*
156 * Validate caches by checking cached attributes. If the cached
157 * attributes have timed out, then get new attributes from the server.
158 * As a side effect, this will do cache invalidation if the attributes
159 * have changed.
160 *
161 * If the attributes have not timed out and if there is a cache
162 * invalidation being done by some other thread, then wait until that
163 * thread has completed the cache invalidation.
164 */
165 int
166 nfs_validate_caches(vnode_t *vp, cred_t *cr)
167 {
168 int error;
169 struct vattr va;
170
171 if (ATTRCACHE_VALID(vp)) {
172 error = nfs_waitfor_purge_complete(vp);
173 if (error)
174 return (error);
175 return (0);
176 }
177
178 va.va_mask = AT_ALL;
179 return (nfs_getattr_otw(vp, &va, cr));
180 }
181
182 /*
183 * Validate caches by checking cached attributes. If the cached
184 * attributes have timed out, then get new attributes from the server.
185 * As a side effect, this will do cache invalidation if the attributes
186 * have changed.
187 *
188 * If the attributes have not timed out and if there is a cache
189 * invalidation being done by some other thread, then wait until that
190 * thread has completed the cache invalidation.
191 */
192 int
193 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
194 {
195 int error;
196 struct vattr va;
197
198 if (ATTRCACHE_VALID(vp)) {
199 error = nfs_waitfor_purge_complete(vp);
200 if (error)
201 return (error);
202 return (0);
203 }
204
205 va.va_mask = AT_ALL;
206 return (nfs3_getattr_otw(vp, &va, cr));
207 }
208
209 /*
210 * Purge all of the various NFS `data' caches.
211 */
212 void
213 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
214 {
215 rnode_t *rp;
216 char *contents;
217 int size;
218 int error;
219
220 /*
221 * Purge the DNLC for any entries which refer to this file.
222 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
223 */
224 rp = VTOR(vp);
225 mutex_enter(&rp->r_statelock);
226 if (vp->v_count > 1 &&
227 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
228 !(rp->r_flags & RINDNLCPURGE)) {
229 /*
230 * Set the RINDNLCPURGE flag to prevent recursive entry
231 * into dnlc_purge_vp()
232 */
233 if (vp->v_type == VDIR)
234 rp->r_flags |= RINDNLCPURGE;
235 mutex_exit(&rp->r_statelock);
236 dnlc_purge_vp(vp);
237 mutex_enter(&rp->r_statelock);
238 if (rp->r_flags & RINDNLCPURGE)
239 rp->r_flags &= ~RINDNLCPURGE;
240 }
241
242 /*
243 * Clear any readdir state bits and purge the readlink response cache.
244 */
245 contents = rp->r_symlink.contents;
246 size = rp->r_symlink.size;
247 rp->r_symlink.contents = NULL;
248 mutex_exit(&rp->r_statelock);
249
250 if (contents != NULL) {
251
252 kmem_free((void *)contents, size);
253 }
254
255 /*
256 * Flush the page cache.
257 */
258 if (vn_has_cached_data(vp)) {
259 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
260 if (error && (error == ENOSPC || error == EDQUOT)) {
261 mutex_enter(&rp->r_statelock);
262 if (!rp->r_error)
263 rp->r_error = error;
264 mutex_exit(&rp->r_statelock);
265 }
266 }
267
268 /*
269 * Flush the readdir response cache.
270 */
271 if (HAVE_RDDIR_CACHE(rp))
272 nfs_purge_rddir_cache(vp);
273 }
274
275 /*
276 * Purge the readdir cache of all entries
277 */
278 void
279 nfs_purge_rddir_cache(vnode_t *vp)
280 {
281 rnode_t *rp;
282 rddir_cache *rdc;
283 rddir_cache *nrdc;
284
285 rp = VTOR(vp);
286 top:
287 mutex_enter(&rp->r_statelock);
288 rp->r_direof = NULL;
289 rp->r_flags &= ~RLOOKUP;
290 rp->r_flags |= RREADDIRPLUS;
291 rdc = avl_first(&rp->r_dir);
292 while (rdc != NULL) {
293 nrdc = AVL_NEXT(&rp->r_dir, rdc);
294 avl_remove(&rp->r_dir, rdc);
295 rddir_cache_rele(rdc);
296 rdc = nrdc;
297 }
298 mutex_exit(&rp->r_statelock);
299 }
300
301 /*
302 * Do a cache check based on the post-operation attributes.
303 * Then make them the new cached attributes. If no attributes
304 * were returned, then mark the attributes as timed out.
305 */
306 void
307 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
308 {
309 vattr_t attr;
310
311 if (!poap->attributes) {
312 PURGE_ATTRCACHE(vp);
313 return;
314 }
315 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
316 }
317
318 /*
319 * Same as above, but using a vattr
320 */
321 void
322 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
323 cred_t *cr)
324 {
325 if (!poap->attributes) {
326 PURGE_ATTRCACHE(vp);
327 return;
328 }
329 nfs_attr_cache(vp, poap->fres.vap, t, cr);
330 }
331
332 /*
333 * Do a cache check based on the weak cache consistency attributes.
334 * These consist of a small set of pre-operation attributes and the
335 * full set of post-operation attributes.
336 *
337 * If we are given the pre-operation attributes, then use them to
338 * check the validity of the various caches. Then, if we got the
339 * post-operation attributes, make them the new cached attributes.
340 * If we didn't get the post-operation attributes, then mark the
341 * attribute cache as timed out so that the next reference will
342 * cause a GETATTR to the server to refresh with the current
343 * attributes.
344 *
345 * Otherwise, if we didn't get the pre-operation attributes, but
346 * we did get the post-operation attributes, then use these
347 * attributes to check the validity of the various caches. This
348 * will probably cause a flush of the caches because if the
349 * operation succeeded, the attributes of the object were changed
350 * in some way from the old post-operation attributes. This
351 * should be okay because it is the safe thing to do. After
352 * checking the data caches, then we make these the new cached
353 * attributes.
354 *
355 * Otherwise, we didn't get either the pre- or post-operation
356 * attributes. Simply mark the attribute cache as timed out so
357 * the next reference will cause a GETATTR to the server to
358 * refresh with the current attributes.
359 *
360 * If an error occurred trying to convert the over the wire
361 * attributes to a vattr, then simply mark the attribute cache as
362 * timed out.
363 */
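/*
 * In short, assuming fattr3_to_vattr() succeeds on the post-operation
 * attributes:
 *
 *   pre-op attrs   post-op attrs   action
 *   present        present         validate caches against pre-op,
 *                                  cache post-op (nfs3_attr_cache)
 *   absent         present         validate against and cache the
 *                                  post-op attributes (nfs_attr_cache)
 *   either         absent          PURGE_ATTRCACHE
 */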
364 void
365 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
366 {
367 vattr_t bva;
368 vattr_t ava;
369
370 if (wccp->after.attributes) {
371 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
372 PURGE_ATTRCACHE(vp);
373 return;
374 }
375 if (wccp->before.attributes) {
376 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
377 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
378 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
379 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
380 bva.va_size = wccp->before.attr.size;
381 nfs3_attr_cache(vp, &bva, &ava, t, cr);
382 } else
383 nfs_attr_cache(vp, &ava, t, cr);
384 } else {
385 PURGE_ATTRCACHE(vp);
386 }
387 }
388
389 /*
390 * Set attributes cache for given vnode using nfsattr.
391 *
392 * This routine does not do cache validation with the attributes.
393 *
394 * If an error occurred trying to convert the over the wire
395 * attributes to a vattr, then simply mark the attribute cache as
396 * timed out.
397 */
398 void
399 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
400 {
401 rnode_t *rp;
402 struct vattr va;
403
404 if (!nattr_to_vattr(vp, na, &va)) {
405 rp = VTOR(vp);
406 mutex_enter(&rp->r_statelock);
407 if (rp->r_mtime <= t)
408 nfs_attrcache_va(vp, &va);
409 mutex_exit(&rp->r_statelock);
410 } else {
411 PURGE_ATTRCACHE(vp);
412 }
413 }
414
415 /*
416 * Set attributes cache for given vnode using fattr3.
417 *
418 * This routine does not do cache validation with the attributes.
419 *
420 * If an error occurred trying to convert the over the wire
421 * attributes to a vattr, then simply mark the attribute cache as
422 * timed out.
423 */
424 void
425 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
426 {
427 rnode_t *rp;
428 struct vattr va;
429
430 if (!fattr3_to_vattr(vp, na, &va)) {
431 rp = VTOR(vp);
432 mutex_enter(&rp->r_statelock);
433 if (rp->r_mtime <= t)
434 nfs_attrcache_va(vp, &va);
435 mutex_exit(&rp->r_statelock);
436 } else {
437 PURGE_ATTRCACHE(vp);
438 }
439 }
440
441 /*
442 * Do a cache check based on attributes returned over the wire. The
443 * new attributes are cached.
444 *
445 * If an error occurred trying to convert the over the wire attributes
446 * to a vattr, then just return that error.
447 *
448 * As a side effect, the vattr argument is filled in with the converted
449 * attributes.
450 */
451 int
452 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
453 cred_t *cr)
454 {
455 int error;
456
457 error = nattr_to_vattr(vp, na, vap);
458 if (error)
459 return (error);
460 nfs_attr_cache(vp, vap, t, cr);
461 return (0);
462 }
463
464 /*
465 * Do a cache check based on attributes returned over the wire. The
466 * new attributes are cached.
467 *
468 * If an error occurred trying to convert the over the wire attributes
469 * to a vattr, then just return that error.
470 *
471 * As a side effect, the vattr argument is filled in with the converted
472 * attributes.
473 */
474 int
475 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
476 {
477 int error;
478
479 error = fattr3_to_vattr(vp, na, vap);
480 if (error)
481 return (error);
482 nfs_attr_cache(vp, vap, t, cr);
483 return (0);
484 }
485
486 /*
487 * Use the passed in virtual attributes to check to see whether the
488 * data and metadata caches are valid, cache the new attributes, and
489 * then do the cache invalidation if required.
490 *
491 * The cache validation and caching of the new attributes is done
492 * atomically via the use of the mutex, r_statelock. If required,
493 * the cache invalidation is done atomically w.r.t. the cache
494 * validation and caching of the attributes via the pseudo lock,
495 * r_serial.
496 *
497 * This routine is used to do cache validation and attributes caching
498 * for operations with a single set of post operation attributes.
499 */
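/*
 * The r_serial "pseudo lock" is simply a record of the thread that is
 * currently purging on behalf of an attribute update: it is set (under
 * r_statelock) before the caches are flushed and, unless this thread
 * already held it, cleared again afterwards with a cv_broadcast() on
 * r_cv, so other threads entering here or nfs_waitfor_purge_complete()
 * wait until the purge is finished rather than racing with it.
 */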
500 void
501 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
502 {
503 rnode_t *rp;
504 int mtime_changed;
505 int ctime_changed;
506 vsecattr_t *vsp;
507 int was_serial;
508
509 rp = VTOR(vp);
510
511 mutex_enter(&rp->r_statelock);
512
513 if (rp->r_serial != curthread) {
514 klwp_t *lwp = ttolwp(curthread);
515
516 was_serial = 0;
517 if (lwp != NULL)
518 lwp->lwp_nostop++;
519 while (rp->r_serial != NULL) {
520 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
521 mutex_exit(&rp->r_statelock);
522 if (lwp != NULL)
523 lwp->lwp_nostop--;
524 return;
525 }
526 }
527 if (lwp != NULL)
528 lwp->lwp_nostop--;
529 } else
530 was_serial = 1;
531
532 if (rp->r_mtime > t) {
533 mutex_exit(&rp->r_statelock);
534 return;
535 }
536
537 if (!(rp->r_flags & RWRITEATTR)) {
538 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
539 mtime_changed = 1;
540 else
541 mtime_changed = 0;
542 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
543 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
544 ctime_changed = 1;
545 else
546 ctime_changed = 0;
547 } else if (rp->r_size != vap->va_size &&
548 (!vn_has_cached_data(vp) ||
549 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) {
550 mtime_changed = 1;
551 ctime_changed = 0;
552 } else {
553 mtime_changed = 0;
554 ctime_changed = 0;
555 }
556
557 nfs_attrcache_va(vp, vap);
558
559 if (!mtime_changed && !ctime_changed) {
560 mutex_exit(&rp->r_statelock);
561 return;
562 }
563
564 rp->r_serial = curthread;
565
566 mutex_exit(&rp->r_statelock);
567
568 if (mtime_changed)
569 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
570
571 if (ctime_changed) {
572 (void) nfs_access_purge_rp(rp);
573 if (rp->r_secattr != NULL) {
574 mutex_enter(&rp->r_statelock);
575 vsp = rp->r_secattr;
576 rp->r_secattr = NULL;
577 mutex_exit(&rp->r_statelock);
578 if (vsp != NULL)
579 nfs_acl_free(vsp);
580 }
581 }
582
583 if (!was_serial) {
584 mutex_enter(&rp->r_statelock);
585 rp->r_serial = NULL;
586 cv_broadcast(&rp->r_cv);
587 mutex_exit(&rp->r_statelock);
588 }
589 }
590
591 /*
592 * Use the passed in "before" virtual attributes to check to see
593 * whether the data and metadata caches are valid, cache the "after"
594 * new attributes, and then do the cache invalidation if required.
595 *
596 * The cache validation and caching of the new attributes is done
597 * atomically via the use of the mutex, r_statelock. If required,
598 * the cache invalidation is done atomically w.r.t. the cache
599 * validation and caching of the attributes via the pseudo lock,
600 * r_serial.
601 *
602 * This routine is used to do cache validation and attributes caching
603 * for operations with both pre operation attributes and post operation
604 * attributes.
605 */
606 static void
607 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
608 cred_t *cr)
609 {
610 rnode_t *rp;
611 int mtime_changed;
612 int ctime_changed;
613 vsecattr_t *vsp;
614 int was_serial;
615
616 rp = VTOR(vp);
617
618 mutex_enter(&rp->r_statelock);
619
620 if (rp->r_serial != curthread) {
621 klwp_t *lwp = ttolwp(curthread);
622
623 was_serial = 0;
624 if (lwp != NULL)
625 lwp->lwp_nostop++;
626 while (rp->r_serial != NULL) {
627 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
628 mutex_exit(&rp->r_statelock);
629 if (lwp != NULL)
630 lwp->lwp_nostop--;
631 return;
632 }
633 }
634 if (lwp != NULL)
635 lwp->lwp_nostop--;
636 } else
637 was_serial = 1;
638
639 if (rp->r_mtime > t) {
640 mutex_exit(&rp->r_statelock);
641 return;
642 }
643
644 if (!(rp->r_flags & RWRITEATTR)) {
645 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
646 mtime_changed = 1;
647 else
648 mtime_changed = 0;
649 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
650 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
651 ctime_changed = 1;
652 else
653 ctime_changed = 0;
654 } else {
655 mtime_changed = 0;
656 ctime_changed = 0;
657 }
658
659 nfs_attrcache_va(vp, avap);
660
661 if (!mtime_changed && !ctime_changed) {
662 mutex_exit(&rp->r_statelock);
663 return;
664 }
665
666 rp->r_serial = curthread;
667
668 mutex_exit(&rp->r_statelock);
669
670 if (mtime_changed)
671 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
672
673 if (ctime_changed) {
674 (void) nfs_access_purge_rp(rp);
675 if (rp->r_secattr != NULL) {
676 mutex_enter(&rp->r_statelock);
677 vsp = rp->r_secattr;
678 rp->r_secattr = NULL;
679 mutex_exit(&rp->r_statelock);
680 if (vsp != NULL)
681 nfs_acl_free(vsp);
682 }
683 }
684
685 if (!was_serial) {
686 mutex_enter(&rp->r_statelock);
687 rp->r_serial = NULL;
688 cv_broadcast(&rp->r_cv);
689 mutex_exit(&rp->r_statelock);
690 }
691 }
692
693 /*
694 * Set attributes cache for given vnode using virtual attributes.
695 *
696 * Set the timeout value on the attribute cache and fill it
697 * with the passed in attributes.
698 *
699 * The caller must be holding r_statelock.
700 */
701 void
702 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
703 {
704 rnode_t *rp;
705 mntinfo_t *mi;
706 hrtime_t delta;
707 hrtime_t now;
708
709 rp = VTOR(vp);
710
711 ASSERT(MUTEX_HELD(&rp->r_statelock));
712
713 now = gethrtime();
714
715 mi = VTOMI(vp);
716
717 /*
718 * Delta is the number of nanoseconds that we will
719 * cache the attributes of the file. It is based on
720 * the number of nanoseconds since the last time that
721 * we detected a change. The assumption is that files
722 * that changed recently are likely to change again.
723 * There is a minimum and a maximum for regular files
724 * and for directories which is enforced though.
725 *
726 * Using the time since last change was detected
727 * eliminates direct comparison or calculation
728 * using mixed client and server times. NFS does
729 * not make any assumptions regarding the client
730 * and server clocks being synchronized.
731 */
732 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
733 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
734 va->va_size != rp->r_attr.va_size)
735 rp->r_mtime = now;
736
737 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
738 delta = 0;
739 else {
740 delta = now - rp->r_mtime;
741 if (vp->v_type == VDIR) {
742 if (delta < mi->mi_acdirmin)
743 delta = mi->mi_acdirmin;
744 else if (delta > mi->mi_acdirmax)
745 delta = mi->mi_acdirmax;
746 } else {
747 if (delta < mi->mi_acregmin)
748 delta = mi->mi_acregmin;
749 else if (delta > mi->mi_acregmax)
750 delta = mi->mi_acregmax;
751 }
752 }
753 rp->r_attrtime = now + delta;
754 rp->r_attr = *va;
755 /*
756 * Update the size of the file if there is no cached data or if
757 * the cached data is clean and there is no data being written
758 * out.
759 */
760 if (rp->r_size != va->va_size &&
761 (!vn_has_cached_data(vp) ||
762 (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
763 rp->r_size = va->va_size;
764 nfs_setswaplike(vp, va);
765 rp->r_flags &= ~RWRITEATTR;
766 }
767
768 /*
769 * Fill in attribute from the cache.
770 * If valid, then return 0 to indicate that no error occurred,
771 * otherwise return 1 to indicate that an error occurred.
772 */
773 static int
774 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
775 {
776 rnode_t *rp;
777
778 rp = VTOR(vp);
779 mutex_enter(&rp->r_statelock);
780 if (ATTRCACHE_VALID(vp)) {
781 /*
782 * Cached attributes are valid
783 */
784 *vap = rp->r_attr;
785 mutex_exit(&rp->r_statelock);
786 return (0);
787 }
788 mutex_exit(&rp->r_statelock);
789 return (1);
790 }
791
792 /*
793 * Get attributes over-the-wire and update attributes cache
794 * if no error occurred in the over-the-wire operation.
795 * Return 0 if successful, otherwise error.
796 */
797 int
798 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
799 {
800 int error;
801 struct nfsattrstat ns;
802 int douprintf;
803 mntinfo_t *mi;
804 failinfo_t fi;
805 hrtime_t t;
806
807 mi = VTOMI(vp);
808 fi.vp = vp;
809 fi.fhp = NULL; /* no need to update, filehandle not copied */
810 fi.copyproc = nfscopyfh;
811 fi.lookupproc = nfslookup;
812 fi.xattrdirproc = acl_getxattrdir2;
813
814 if (mi->mi_flags & MI_ACL) {
815 error = acl_getattr2_otw(vp, vap, cr);
816 if (mi->mi_flags & MI_ACL)
817 return (error);
818 }
819
820 douprintf = 1;
821
822 t = gethrtime();
823
824 error = rfs2call(mi, RFS_GETATTR,
825 xdr_fhandle, (caddr_t)VTOFH(vp),
826 xdr_attrstat, (caddr_t)&ns, cr,
827 &douprintf, &ns.ns_status, 0, &fi);
828
829 if (!error) {
830 error = geterrno(ns.ns_status);
831 if (!error)
832 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
833 else {
834 PURGE_STALE_FH(error, vp, cr);
835 }
836 }
837
838 return (error);
839 }
840
841 /*
842 * Return either cached or remote attributes. If we get remote attributes,
843 * use them to check and invalidate caches, then cache the new attributes.
844 */
845 int
846 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
847 {
848 int error;
849 rnode_t *rp;
850
851 /*
852 * If we've got cached attributes, we're done, otherwise go
853 * to the server to get attributes, which will update the cache
854 * in the process.
855 */
856 error = nfs_getattr_cache(vp, vap);
857 if (error)
858 error = nfs_getattr_otw(vp, vap, cr);
859
860 /* Return the client's view of file size */
861 rp = VTOR(vp);
862 mutex_enter(&rp->r_statelock);
863 vap->va_size = rp->r_size;
864 mutex_exit(&rp->r_statelock);
865
866 return (error);
867 }
868
869 /*
870 * Get attributes over-the-wire and update attributes cache
871 * if no error occurred in the over-the-wire operation.
872 * Return 0 if successful, otherwise error.
873 */
874 int
875 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
876 {
877 int error;
878 GETATTR3args args;
879 GETATTR3vres res;
880 int douprintf;
881 failinfo_t fi;
882 hrtime_t t;
883
884 args.object = *VTOFH3(vp);
885 fi.vp = vp;
886 fi.fhp = (caddr_t)&args.object;
887 fi.copyproc = nfs3copyfh;
888 fi.lookupproc = nfs3lookup;
889 fi.xattrdirproc = acl_getxattrdir3;
890 res.fres.vp = vp;
891 res.fres.vap = vap;
892
893 douprintf = 1;
894
895 t = gethrtime();
896
897 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
898 xdr_nfs_fh3, (caddr_t)&args,
899 xdr_GETATTR3vres, (caddr_t)&res, cr,
900 &douprintf, &res.status, 0, &fi);
901
902 if (error)
903 return (error);
904
905 error = geterrno3(res.status);
906 if (error) {
907 PURGE_STALE_FH(error, vp, cr);
908 return (error);
909 }
910
911 /*
912 * Catch status codes that indicate fattr3 to vattr translation failure
913 */
914 if (res.fres.status)
915 return (res.fres.status);
916
917 nfs_attr_cache(vp, vap, t, cr);
918 return (0);
919 }
920
921 /*
922 * Return either cached or remote attributes. If we get remote attributes,
923 * use them to check and invalidate caches, then cache the new attributes.
924 */
925 int
926 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
927 {
928 int error;
929 rnode_t *rp;
930
931 /*
932 * If we've got cached attributes, we're done, otherwise go
933 * to the server to get attributes, which will update the cache
934 * in the process.
935 */
936 error = nfs_getattr_cache(vp, vap);
937 if (error)
938 error = nfs3_getattr_otw(vp, vap, cr);
939
940 /* Return the client's view of file size */
941 rp = VTOR(vp);
942 mutex_enter(&rp->r_statelock);
943 vap->va_size = rp->r_size;
944 mutex_exit(&rp->r_statelock);
945
946 return (error);
947 }
948
949 vtype_t nf_to_vt[] = {
950 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
951 };
952 /*
953 * Convert NFS Version 2 over the network attributes to the local
954 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
955 * network representation and the local representation is done here.
956 * Returns 0 for success, error if failed due to overflow.
957 */
958 int
959 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
960 {
961 /* overflow in time attributes? */
962 #ifndef _LP64
963 if (!NFS2_FATTR_TIME_OK(na))
964 return (EOVERFLOW);
965 #endif
966
967 if (na->na_type < NFNON || na->na_type > NFSOC)
968 vap->va_type = VBAD;
969 else
970 vap->va_type = nf_to_vt[na->na_type];
971 vap->va_mode = na->na_mode;
972 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
973 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
974 vap->va_fsid = vp->v_vfsp->vfs_dev;
975 vap->va_nodeid = na->na_nodeid;
976 vap->va_nlink = na->na_nlink;
977 vap->va_size = na->na_size; /* keep for cache validation */
978 /*
979 * nfs protocol defines times as unsigned so don't extend sign,
980 * unless sysadmin set nfs_allow_preepoch_time.
981 */
982 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
983 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
984 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
985 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
986 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
987 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
988 /*
989 * Shannon's law - uncompress the received dev_t
990 * if the top half of it is zero, indicating a response
991 * from an `older style' OS. Except for when it is a
992 * `new style' OS sending a major device of zero,
993 * in which case the algorithm still works because the
994 * fact that it is a new style server
995 * is hidden by the minor device not being greater
996 * than 255 (a requirement in this case).
997 */
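/*
 * For example, a reply whose na_rdev has a zero top half is assumed to
 * use the old 16-bit packing and is expanded with nfsv2_expdev(), while
 * a value with any of the top 16 bits set is treated as a new-style
 * 32-bit dev_t and expanded with expldev().
 */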
998 if ((na->na_rdev & 0xffff0000) == 0)
999 vap->va_rdev = nfsv2_expdev(na->na_rdev);
1000 else
1001 vap->va_rdev = expldev(na->na_rdev);
1002
1003 vap->va_nblocks = na->na_blocks;
1004 switch (na->na_type) {
1005 case NFBLK:
1006 vap->va_blksize = DEV_BSIZE;
1007 break;
1008
1009 case NFCHR:
1010 vap->va_blksize = MAXBSIZE;
1011 break;
1012
1013 case NFSOC:
1014 default:
1015 vap->va_blksize = na->na_blocksize;
1016 break;
1017 }
1018 /*
1019 * This bit of ugliness is a hack to preserve the
1020 * over-the-wire protocols for named-pipe vnodes.
1021 * It remaps the special over-the-wire type to the
1022 * VFIFO type. (see note in nfs.h)
1023 */
1024 if (NA_ISFIFO(na)) {
1025 vap->va_type = VFIFO;
1026 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1027 vap->va_rdev = 0;
1028 vap->va_blksize = na->na_blocksize;
1029 }
1030 vap->va_seq = 0;
1031 return (0);
1032 }
1033
1034 /*
1035 * Convert NFS Version 3 over the network attributes to the local
1036 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1037 * network representation and the local representation is done here.
1038 */
1039 vtype_t nf3_to_vt[] = {
1040 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1041 };
1042
1043 int
1044 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1045 {
1046
1047 #ifndef _LP64
1048 /* overflow in time attributes? */
1049 if (!NFS3_FATTR_TIME_OK(na))
1050 return (EOVERFLOW);
1051 #endif
1052 if (!NFS3_SIZE_OK(na->size))
1053 /* file too big */
1054 return (EFBIG);
1055
1056 vap->va_mask = AT_ALL;
1057
1058 if (na->type < NF3REG || na->type > NF3FIFO)
1059 vap->va_type = VBAD;
1060 else
1061 vap->va_type = nf3_to_vt[na->type];
1062 vap->va_mode = na->mode;
1063 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1064 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1065 vap->va_fsid = vp->v_vfsp->vfs_dev;
1066 vap->va_nodeid = na->fileid;
1067 vap->va_nlink = na->nlink;
1068 vap->va_size = na->size;
1069
1070 /*
1071 * nfs protocol defines times as unsigned so don't extend sign,
1072 * unless sysadmin set nfs_allow_preepoch_time.
1073 */
1074 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1075 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1076 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1077 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1078 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1079 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1080
1081 switch (na->type) {
1082 case NF3BLK:
1083 vap->va_rdev = makedevice(na->rdev.specdata1,
1084 na->rdev.specdata2);
1085 vap->va_blksize = DEV_BSIZE;
1086 vap->va_nblocks = 0;
1087 break;
1088 case NF3CHR:
1089 vap->va_rdev = makedevice(na->rdev.specdata1,
1090 na->rdev.specdata2);
1091 vap->va_blksize = MAXBSIZE;
1092 vap->va_nblocks = 0;
1093 break;
1094 case NF3REG:
1095 case NF3DIR:
1096 case NF3LNK:
1097 vap->va_rdev = 0;
1098 vap->va_blksize = MAXBSIZE;
1099 vap->va_nblocks = (u_longlong_t)
1100 ((na->used + (size3)DEV_BSIZE - (size3)1) /
1101 (size3)DEV_BSIZE);
1102 break;
1103 case NF3SOCK:
1104 case NF3FIFO:
1105 default:
1106 vap->va_rdev = 0;
1107 vap->va_blksize = MAXBSIZE;
1108 vap->va_nblocks = 0;
1109 break;
1110 }
1111 vap->va_seq = 0;
1112 return (0);
1113 }
1114
1115 /*
1116 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1117 * for the demand-based allocation of async threads per-mount. The
1118 * nfs_async_timeout is the amount of time a thread will live after it
1119 * becomes idle, unless new I/O requests are received before the thread
1120 * dies. See nfs_async_putpage and nfs_async_start.
1121 */
1122
1123 int nfs_async_timeout = -1; /* uninitialized */
1124
1125 static void nfs_async_start(struct vfs *);
1126
1127 static void
1128 free_async_args(struct nfs_async_reqs *args)
1129 {
1130 rnode_t *rp;
1131
1132 if (args->a_io != NFS_INACTIVE) {
1133 rp = VTOR(args->a_vp);
1134 mutex_enter(&rp->r_statelock);
1135 rp->r_count--;
1136 if (args->a_io == NFS_PUTAPAGE ||
1137 args->a_io == NFS_PAGEIO)
1138 rp->r_awcount--;
1139 cv_broadcast(&rp->r_cv);
1140 mutex_exit(&rp->r_statelock);
1141 VN_RELE(args->a_vp);
1142 }
1143 crfree(args->a_cred);
1144 kmem_free(args, sizeof (*args));
1145 }
1146
1147 /*
1148 * Cross-zone thread creation and NFS access are disallowed, yet fsflush() and
1149 * pageout(), running in the global zone, have legitimate reasons to do
1150 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1151 * use of a per-mount "asynchronous requests manager thread" which is
1152 * signaled by the various asynchronous work routines when there is
1153 * asynchronous work to be done. It is responsible for creating new
1154 * worker threads if necessary, and notifying existing worker threads
1155 * that there is work to be done.
1156 *
1157 * In other words, it will "take the specifications from the customers and
1158 * give them to the engineers."
1159 *
1160 * Worker threads die off of their own accord if they are no longer
1161 * needed.
1162 *
1163 * This thread is killed when the zone is going away or the filesystem
1164 * is being unmounted.
1165 */
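/*
 * A rough sketch of the flow: the *_async_*() routines below queue a
 * request on one of the mi_async_reqs[] lists and cv_signal() the
 * manager via mi_async_reqs_cv; the manager creates worker threads
 * (nfs_async_start) as needed and signals mi_async_work_cv to hand the
 * work off; nfs_async_manager_stop() and nfs_async_stop() shut the
 * machinery down at unmount or zone shutdown.
 */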
1166 void
1167 nfs_async_manager(vfs_t *vfsp)
1168 {
1169 callb_cpr_t cprinfo;
1170 mntinfo_t *mi;
1171 uint_t max_threads;
1172
1173 mi = VFTOMI(vfsp);
1174
1175 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1176 "nfs_async_manager");
1177
1178 mutex_enter(&mi->mi_async_lock);
1179 /*
1180 * We want to stash the max number of threads that this mount was
1181 * allowed so we can use it later when the variable is set to zero as
1182 * part of the zone/mount going away.
1183 *
1184 * We want to be able to create at least one thread to handle
1185 * asynchronous inactive calls.
1186 */
1187 max_threads = MAX(mi->mi_max_threads, 1);
1188 mutex_enter(&mi->mi_lock);
1189 /*
1190 * We don't want to wait for mi_max_threads to go to zero, since that
1191 * happens as part of a failed unmount, but this thread should only
1192 * exit when the mount/zone is really going away.
1193 *
1194 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1195 * attempted: the various _async_*() functions know to do things
1196 * inline if mi_max_threads == 0. Henceforth we just drain out the
1197 * outstanding requests.
1198 *
1199 * Note that we still create zthreads even if we notice the zone is
1200 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1201 * shutdown sequence to take slightly longer in some cases, but
1202 * doesn't violate the protocol, as all threads will exit as soon as
1203 * they're done processing the remaining requests.
1204 */
1205 while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
1206 mi->mi_async_req_count > 0) {
1207 mutex_exit(&mi->mi_lock);
1208 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1209 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1210 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1211 while (mi->mi_async_req_count > 0) {
1212 /*
1213 * Paranoia: If the mount started out having
1214 * (mi->mi_max_threads == 0), and the value was
1215 * later changed (via a debugger or somesuch),
1216 * we could be confused since we will think we
1217 * can't create any threads, and the calling
1218 * code (which looks at the current value of
1219 * mi->mi_max_threads, now non-zero) thinks we
1220 * can.
1221 *
1222 * So, because we're paranoid, we create threads
1223 * up to the maximum of the original and the
1224 * current value. This means that future
1225 * (debugger-induced) lowerings of
1226 * mi->mi_max_threads are ignored for our
1227 * purposes, but who told them they could change
1228 * random values on a live kernel anyhow?
1229 */
1230 if (mi->mi_threads <
1231 MAX(mi->mi_max_threads, max_threads)) {
1232 mi->mi_threads++;
1233 mutex_exit(&mi->mi_async_lock);
1234 VFS_HOLD(vfsp); /* hold for new thread */
1235 (void) zthread_create(NULL, 0, nfs_async_start,
1236 vfsp, 0, minclsyspri);
1237 mutex_enter(&mi->mi_async_lock);
1238 }
1239 cv_signal(&mi->mi_async_work_cv);
1240 ASSERT(mi->mi_async_req_count != 0);
1241 mi->mi_async_req_count--;
1242 }
1243 mutex_enter(&mi->mi_lock);
1244 }
1245 mutex_exit(&mi->mi_lock);
1246 /*
1247 * Let everyone know we're done.
1248 */
1249 mi->mi_manager_thread = NULL;
1250 cv_broadcast(&mi->mi_async_cv);
1251
1252 /*
1253 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1254 * since CALLB_CPR_EXIT is actually responsible for releasing
1255 * 'mi_async_lock'.
1256 */
1257 CALLB_CPR_EXIT(&cprinfo);
1258 VFS_RELE(vfsp); /* release thread's hold */
1259 zthread_exit();
1260 }
1261
1262 /*
1263 * Signal (and wait for) the async manager thread to clean up and go away.
1264 */
1265 void
1266 nfs_async_manager_stop(vfs_t *vfsp)
1267 {
1268 mntinfo_t *mi = VFTOMI(vfsp);
1269
1270 mutex_enter(&mi->mi_async_lock);
1271 mutex_enter(&mi->mi_lock);
1272 mi->mi_flags |= MI_ASYNC_MGR_STOP;
1273 mutex_exit(&mi->mi_lock);
1274 cv_broadcast(&mi->mi_async_reqs_cv);
1275 while (mi->mi_manager_thread != NULL)
1276 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1277 mutex_exit(&mi->mi_async_lock);
1278 }
1279
1280 int
1281 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1282 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1283 u_offset_t, caddr_t, struct seg *, cred_t *))
1284 {
1285 rnode_t *rp;
1286 mntinfo_t *mi;
1287 struct nfs_async_reqs *args;
1288
1289 rp = VTOR(vp);
1290 ASSERT(rp->r_freef == NULL);
1291
1292 mi = VTOMI(vp);
1293
1294 /*
1295 * If addr falls in a different segment, don't bother doing readahead.
1296 */
1297 if (addr >= seg->s_base + seg->s_size)
1298 return (-1);
1299
1300 /*
1301 * If we can't allocate a request structure, punt on the readahead.
1302 */
1303 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1304 return (-1);
1305
1306 /*
1307 * If a lock operation is pending, don't initiate any new
1308 * readaheads. Otherwise, bump r_count to indicate the new
1309 * asynchronous I/O.
1310 */
1311 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1312 kmem_free(args, sizeof (*args));
1313 return (-1);
1314 }
1315 mutex_enter(&rp->r_statelock);
1316 rp->r_count++;
1317 mutex_exit(&rp->r_statelock);
1318 nfs_rw_exit(&rp->r_lkserlock);
1319
1320 args->a_next = NULL;
1321 #ifdef DEBUG
1322 args->a_queuer = curthread;
1323 #endif
1324 VN_HOLD(vp);
1325 args->a_vp = vp;
1326 ASSERT(cr != NULL);
1327 crhold(cr);
1328 args->a_cred = cr;
1329 args->a_io = NFS_READ_AHEAD;
1330 args->a_nfs_readahead = readahead;
1331 args->a_nfs_blkoff = blkoff;
1332 args->a_nfs_seg = seg;
1333 args->a_nfs_addr = addr;
1334
1335 mutex_enter(&mi->mi_async_lock);
1336
1337 /*
1338 * If asyncio has been disabled, don't bother readahead.
1339 */
1340 if (mi->mi_max_threads == 0) {
1341 mutex_exit(&mi->mi_async_lock);
1342 goto noasync;
1343 }
1344
1345 /*
1346 * Link request structure into the async list and
1347 * wakeup async thread to do the i/o.
1348 */
1349 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1350 mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1351 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1352 } else {
1353 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1354 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1355 }
1356
1357 if (mi->mi_io_kstats) {
1358 mutex_enter(&mi->mi_lock);
1359 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1360 mutex_exit(&mi->mi_lock);
1361 }
1362
1363 mi->mi_async_req_count++;
1364 ASSERT(mi->mi_async_req_count != 0);
1365 cv_signal(&mi->mi_async_reqs_cv);
1366 mutex_exit(&mi->mi_async_lock);
1367 return (0);
1368
1369 noasync:
1370 mutex_enter(&rp->r_statelock);
1371 rp->r_count--;
1372 cv_broadcast(&rp->r_cv);
1373 mutex_exit(&rp->r_statelock);
1374 VN_RELE(vp);
1375 crfree(cr);
1376 kmem_free(args, sizeof (*args));
1377 return (-1);
1378 }
1379
1380 int
1381 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1382 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1383 u_offset_t, size_t, int, cred_t *))
1384 {
1385 rnode_t *rp;
1386 mntinfo_t *mi;
1387 struct nfs_async_reqs *args;
1388
1389 ASSERT(flags & B_ASYNC);
1390 ASSERT(vp->v_vfsp != NULL);
1391
1392 rp = VTOR(vp);
1393 ASSERT(rp->r_count > 0);
1394
1395 mi = VTOMI(vp);
1396
1397 /*
1398 * If we can't allocate a request structure, do the putpage
1399 * operation synchronously in this thread's context.
1400 */
1401 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1402 goto noasync;
1403
1404 args->a_next = NULL;
1405 #ifdef DEBUG
1406 args->a_queuer = curthread;
1407 #endif
1408 VN_HOLD(vp);
1409 args->a_vp = vp;
1410 ASSERT(cr != NULL);
1411 crhold(cr);
1412 args->a_cred = cr;
1413 args->a_io = NFS_PUTAPAGE;
1414 args->a_nfs_putapage = putapage;
1415 args->a_nfs_pp = pp;
1416 args->a_nfs_off = off;
1417 args->a_nfs_len = (uint_t)len;
1418 args->a_nfs_flags = flags;
1419
1420 mutex_enter(&mi->mi_async_lock);
1421
1422 /*
1423 * If asyncio has been disabled, then make a synchronous request.
1424 * This check is done a second time in case async io was disabled
1425 * while this thread was blocked waiting for memory pressure to
1426 * reduce or for the queue to drain.
1427 */
1428 if (mi->mi_max_threads == 0) {
1429 mutex_exit(&mi->mi_async_lock);
1430 goto noasync;
1431 }
1432
1433 /*
1434 * Link request structure into the async list and
1435 * wakeup async thread to do the i/o.
1436 */
1437 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1438 mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1439 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1440 } else {
1441 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1442 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1443 }
1444
1445 mutex_enter(&rp->r_statelock);
1446 rp->r_count++;
1447 rp->r_awcount++;
1448 mutex_exit(&rp->r_statelock);
1449
1450 if (mi->mi_io_kstats) {
1451 mutex_enter(&mi->mi_lock);
1452 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1453 mutex_exit(&mi->mi_lock);
1454 }
1455
1456 mi->mi_async_req_count++;
1457 ASSERT(mi->mi_async_req_count != 0);
1458 cv_signal(&mi->mi_async_reqs_cv);
1459 mutex_exit(&mi->mi_async_lock);
1460 return (0);
1461
1462 noasync:
1463 if (args != NULL) {
1464 VN_RELE(vp);
1465 crfree(cr);
1466 kmem_free(args, sizeof (*args));
1467 }
1468
1469 if (curproc == proc_pageout || curproc == proc_fsflush) {
1470 /*
1471 * If we get here in the context of the pageout/fsflush,
1472 * we refuse to do a sync write, because this may hang
1473 * pageout (and the machine). In this case, we just
1474 * re-mark the page as dirty and punt on the page.
1475 *
1476 * Make sure B_FORCE isn't set. We can re-mark the
1477 * pages as dirty and unlock the pages in one swoop by
1478 * passing in B_ERROR to pvn_write_done(). However,
1479 * we should make sure B_FORCE isn't set - we don't
1480 * want the page tossed before it gets written out.
1481 */
1482 if (flags & B_FORCE)
1483 flags &= ~(B_INVAL | B_FORCE);
1484 pvn_write_done(pp, flags | B_ERROR);
1485 return (0);
1486 }
1487 if (nfs_zone() != mi->mi_zone) {
1488 /*
1489 * So this was a cross-zone sync putpage. We pass in B_ERROR
1490 * to pvn_write_done() to re-mark the pages as dirty and unlock
1491 * them.
1492 *
1493 * We don't want to clear B_FORCE here as the caller presumably
1494 * knows what they're doing if they set it.
1495 */
1496 pvn_write_done(pp, flags | B_ERROR);
1497 return (EPERM);
1498 }
1499 return ((*putapage)(vp, pp, off, len, flags, cr));
1500 }
1501
1502 int
1503 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1504 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1505 size_t, int, cred_t *))
1506 {
1507 rnode_t *rp;
1508 mntinfo_t *mi;
1509 struct nfs_async_reqs *args;
1510
1511 ASSERT(flags & B_ASYNC);
1512 ASSERT(vp->v_vfsp != NULL);
1513
1514 rp = VTOR(vp);
1515 ASSERT(rp->r_count > 0);
1516
1517 mi = VTOMI(vp);
1518
1519 /*
1520 * If we can't allocate a request structure, do the pageio
1521 * request synchronously in this thread's context.
1522 */
1523 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1524 goto noasync;
1525
1526 args->a_next = NULL;
1527 #ifdef DEBUG
1528 args->a_queuer = curthread;
1529 #endif
1530 VN_HOLD(vp);
1531 args->a_vp = vp;
1532 ASSERT(cr != NULL);
1533 crhold(cr);
1534 args->a_cred = cr;
1535 args->a_io = NFS_PAGEIO;
1536 args->a_nfs_pageio = pageio;
1537 args->a_nfs_pp = pp;
1538 args->a_nfs_off = io_off;
1539 args->a_nfs_len = (uint_t)io_len;
1540 args->a_nfs_flags = flags;
1541
1542 mutex_enter(&mi->mi_async_lock);
1543
1544 /*
1545 * If asyncio has been disabled, then make a synchronous request.
1546 * This check is done a second time in case async io was disabled
1547 * while this thread was blocked waiting for memory pressure to
1548 * reduce or for the queue to drain.
1549 */
1550 if (mi->mi_max_threads == 0) {
1551 mutex_exit(&mi->mi_async_lock);
1552 goto noasync;
1553 }
1554
1555 /*
1556 * Link request structure into the async list and
1557 * wakeup async thread to do the i/o.
1558 */
1559 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1560 mi->mi_async_reqs[NFS_PAGEIO] = args;
1561 mi->mi_async_tail[NFS_PAGEIO] = args;
1562 } else {
1563 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1564 mi->mi_async_tail[NFS_PAGEIO] = args;
1565 }
1566
1567 mutex_enter(&rp->r_statelock);
1568 rp->r_count++;
1569 rp->r_awcount++;
1570 mutex_exit(&rp->r_statelock);
1571
1572 if (mi->mi_io_kstats) {
1573 mutex_enter(&mi->mi_lock);
1574 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1575 mutex_exit(&mi->mi_lock);
1576 }
1577
1578 mi->mi_async_req_count++;
1579 ASSERT(mi->mi_async_req_count != 0);
1580 cv_signal(&mi->mi_async_reqs_cv);
1581 mutex_exit(&mi->mi_async_lock);
1582 return (0);
1583
1584 noasync:
1585 if (args != NULL) {
1586 VN_RELE(vp);
1587 crfree(cr);
1588 kmem_free(args, sizeof (*args));
1589 }
1590
1591 /*
1592 * If we can't do it ASYNC, for reads we do nothing (but clean up
1593 * the page list), for writes we do it synchronously, except for
1594 * proc_pageout/proc_fsflush as described below.
1595 */
1596 if (flags & B_READ) {
1597 pvn_read_done(pp, flags | B_ERROR);
1598 return (0);
1599 }
1600
1601 if (curproc == proc_pageout || curproc == proc_fsflush) {
1602 /*
1603 * If we get here in the context of the pageout/fsflush,
1604 * we refuse to do a sync write, because this may hang
1605 * pageout/fsflush (and the machine). In this case, we just
1606 * re-mark the page as dirty and punt on the page.
1607 *
1608 * Make sure B_FORCE isn't set. We can re-mark the
1609 * pages as dirty and unlock the pages in one swoop by
1610 * passing in B_ERROR to pvn_write_done(). However,
1611 * we should make sure B_FORCE isn't set - we don't
1612 * want the page tossed before it gets written out.
1613 */
1614 if (flags & B_FORCE)
1615 flags &= ~(B_INVAL | B_FORCE);
1616 pvn_write_done(pp, flags | B_ERROR);
1617 return (0);
1618 }
1619
1620 if (nfs_zone() != mi->mi_zone) {
1621 /*
1622 * So this was a cross-zone sync pageio. We pass in B_ERROR
1623 * to pvn_write_done() to re-mark the pages as dirty and unlock
1624 * them.
1625 *
1626 * We don't want to clear B_FORCE here as the caller presumably
1627 * knows what they're doing if they set it.
1628 */
1629 pvn_write_done(pp, flags | B_ERROR);
1630 return (EPERM);
1631 }
1632 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1633 }
1634
1635 void
1636 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1637 int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1638 {
1639 rnode_t *rp;
1640 mntinfo_t *mi;
1641 struct nfs_async_reqs *args;
1642
1643 rp = VTOR(vp);
1644 ASSERT(rp->r_freef == NULL);
1645
1646 mi = VTOMI(vp);
1647
1648 /*
1649 * If we can't allocate a request structure, do the readdir
1650 * operation synchronously in this thread's context.
1651 */
1652 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1653 goto noasync;
1654
1655 args->a_next = NULL;
1656 #ifdef DEBUG
1657 args->a_queuer = curthread;
1658 #endif
1659 VN_HOLD(vp);
1660 args->a_vp = vp;
1661 ASSERT(cr != NULL);
1662 crhold(cr);
1663 args->a_cred = cr;
1664 args->a_io = NFS_READDIR;
1665 args->a_nfs_readdir = readdir;
1666 args->a_nfs_rdc = rdc;
1667
1668 mutex_enter(&mi->mi_async_lock);
1669
1670 /*
1671 * If asyncio has been disabled, then make a synchronous request.
1672 */
1673 if (mi->mi_max_threads == 0) {
1674 mutex_exit(&mi->mi_async_lock);
1675 goto noasync;
1676 }
1677
1678 /*
1679 * Link request structure into the async list and
1680 * wakeup async thread to do the i/o.
1681 */
1682 if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1683 mi->mi_async_reqs[NFS_READDIR] = args;
1684 mi->mi_async_tail[NFS_READDIR] = args;
1685 } else {
1686 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1687 mi->mi_async_tail[NFS_READDIR] = args;
1688 }
1689
1690 mutex_enter(&rp->r_statelock);
1691 rp->r_count++;
1692 mutex_exit(&rp->r_statelock);
1693
1694 if (mi->mi_io_kstats) {
1695 mutex_enter(&mi->mi_lock);
1696 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1697 mutex_exit(&mi->mi_lock);
1698 }
1699
1700 mi->mi_async_req_count++;
1701 ASSERT(mi->mi_async_req_count != 0);
1702 cv_signal(&mi->mi_async_reqs_cv);
1703 mutex_exit(&mi->mi_async_lock);
1704 return;
1705
1706 noasync:
1707 if (args != NULL) {
1708 VN_RELE(vp);
1709 crfree(cr);
1710 kmem_free(args, sizeof (*args));
1711 }
1712
1713 rdc->entries = NULL;
1714 mutex_enter(&rp->r_statelock);
1715 ASSERT(rdc->flags & RDDIR);
1716 rdc->flags &= ~RDDIR;
1717 rdc->flags |= RDDIRREQ;
1718 /*
1719 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1720 * is set, wakeup the thread sleeping in cv_wait_sig().
1721 * The woken up thread will reset the flag to RDDIR and will
1722 * continue with the readdir operation.
1723 */
1724 if (rdc->flags & RDDIRWAIT) {
1725 rdc->flags &= ~RDDIRWAIT;
1726 cv_broadcast(&rdc->cv);
1727 }
1728 mutex_exit(&rp->r_statelock);
1729 rddir_cache_rele(rdc);
1730 }
1731
1732 void
1733 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1734 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1735 cred_t *))
1736 {
1737 rnode_t *rp;
1738 mntinfo_t *mi;
1739 struct nfs_async_reqs *args;
1740 page_t *pp;
1741
1742 rp = VTOR(vp);
1743 mi = VTOMI(vp);
1744
1745 /*
1746 * If we can't allocate a request structure, do the commit
1747 * operation synchronously in this thread's context.
1748 */
1749 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1750 goto noasync;
1751
1752 args->a_next = NULL;
1753 #ifdef DEBUG
1754 args->a_queuer = curthread;
1755 #endif
1756 VN_HOLD(vp);
1757 args->a_vp = vp;
1758 ASSERT(cr != NULL);
1759 crhold(cr);
1760 args->a_cred = cr;
1761 args->a_io = NFS_COMMIT;
1762 args->a_nfs_commit = commit;
1763 args->a_nfs_plist = plist;
1764 args->a_nfs_offset = offset;
1765 args->a_nfs_count = count;
1766
1767 mutex_enter(&mi->mi_async_lock);
1768
1769 /*
1770 * If asyncio has been disabled, then make a synchronous request.
1771 * This check is done a second time in case async io was disabled
1772 * while this thread was blocked waiting for memory pressure to
1773 * reduce or for the queue to drain.
1774 */
1775 if (mi->mi_max_threads == 0) {
1776 mutex_exit(&mi->mi_async_lock);
1777 goto noasync;
1778 }
1779
1780 /*
1781 * Link request structure into the async list and
1782 * wakeup async thread to do the i/o.
1783 */
1784 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1785 mi->mi_async_reqs[NFS_COMMIT] = args;
1786 mi->mi_async_tail[NFS_COMMIT] = args;
1787 } else {
1788 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1789 mi->mi_async_tail[NFS_COMMIT] = args;
1790 }
1791
1792 mutex_enter(&rp->r_statelock);
1793 rp->r_count++;
1794 mutex_exit(&rp->r_statelock);
1795
1796 if (mi->mi_io_kstats) {
1797 mutex_enter(&mi->mi_lock);
1798 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1799 mutex_exit(&mi->mi_lock);
1800 }
1801
1802 mi->mi_async_req_count++;
1803 ASSERT(mi->mi_async_req_count != 0);
1804 cv_signal(&mi->mi_async_reqs_cv);
1805 mutex_exit(&mi->mi_async_lock);
1806 return;
1807
1808 noasync:
1809 if (args != NULL) {
1810 VN_RELE(vp);
1811 crfree(cr);
1812 kmem_free(args, sizeof (*args));
1813 }
1814
1815 if (curproc == proc_pageout || curproc == proc_fsflush ||
1816 nfs_zone() != mi->mi_zone) {
1817 while (plist != NULL) {
1818 pp = plist;
1819 page_sub(&plist, pp);
1820 pp->p_fsdata = C_COMMIT;
1821 page_unlock(pp);
1822 }
1823 return;
1824 }
1825 (*commit)(vp, plist, offset, count, cr);
1826 }
1827
1828 void
1829 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1830 void (*inactive)(vnode_t *, cred_t *))
1831 {
1832 mntinfo_t *mi;
1833 struct nfs_async_reqs *args;
1834
1835 mi = VTOMI(vp);
1836
1837 args = kmem_alloc(sizeof (*args), KM_SLEEP);
1838 args->a_next = NULL;
1839 #ifdef DEBUG
1840 args->a_queuer = curthread;
1841 #endif
1842 args->a_vp = vp;
1843 ASSERT(cr != NULL);
1844 crhold(cr);
1845 args->a_cred = cr;
1846 args->a_io = NFS_INACTIVE;
1847 args->a_nfs_inactive = inactive;
1848
1849 /*
1850 * Note that we don't check mi->mi_max_threads here, since we
1851 * *need* to get rid of this vnode regardless of whether someone
1852 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1853 *
1854 * The manager thread knows about this and is willing to create
1855 * at least one thread to accommodate us.
1856 */
1857 mutex_enter(&mi->mi_async_lock);
1858 if (mi->mi_manager_thread == NULL) {
1859 rnode_t *rp = VTOR(vp);
1860
1861 mutex_exit(&mi->mi_async_lock);
1862 crfree(cr); /* drop our reference */
1863 kmem_free(args, sizeof (*args));
1864 /*
1865 * We can't do an over-the-wire call since we're in the wrong
1866 * zone, so we need to clean up state as best we can and then
1867 * throw away the vnode.
1868 */
1869 mutex_enter(&rp->r_statelock);
1870 if (rp->r_unldvp != NULL) {
1871 vnode_t *unldvp;
1872 char *unlname;
1873 cred_t *unlcred;
1874
1875 unldvp = rp->r_unldvp;
1876 rp->r_unldvp = NULL;
1877 unlname = rp->r_unlname;
1878 rp->r_unlname = NULL;
1879 unlcred = rp->r_unlcred;
1880 rp->r_unlcred = NULL;
1881 mutex_exit(&rp->r_statelock);
1882
1883 VN_RELE(unldvp);
1884 kmem_free(unlname, MAXNAMELEN);
1885 crfree(unlcred);
1886 } else {
1887 mutex_exit(&rp->r_statelock);
1888 }
1889 /*
1890 * No need to explicitly throw away any cached pages. The
1891 * eventual rinactive() will attempt a synchronous
1892 * VOP_PUTPAGE() which will immediately fail since the request
1893 * is coming from the wrong zone, and then will proceed to call
1894 * nfs_invalidate_pages() which will clean things up for us.
1895 */
1896 rp_addfree(VTOR(vp), cr);
1897 return;
1898 }
1899
1900 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1901 mi->mi_async_reqs[NFS_INACTIVE] = args;
1902 } else {
1903 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1904 }
1905 mi->mi_async_tail[NFS_INACTIVE] = args;
1906 /*
1907 * Don't increment r_count, since we're trying to get rid of the vnode.
1908 */
1909
1910 mi->mi_async_req_count++;
1911 ASSERT(mi->mi_async_req_count != 0);
1912 cv_signal(&mi->mi_async_reqs_cv);
1913 mutex_exit(&mi->mi_async_lock);
1914 }
1915
1916 /*
1917 * The async queues for each mounted file system are arranged as a
1918 * set of queues, one for each async i/o type. Requests are taken
1919 * from the queues in a round-robin fashion. A number of consecutive
1920 * requests are taken from each queue before moving on to the next
1921 * queue. This functionality may allow the NFS Version 2 server to do
1922 * write clustering, even if the client is mixing writes and reads
1923 * because it will take multiple write requests from the queue
1924 * before processing any of the other async i/o types.
1925 *
1926 * XXX The nfs_async_start thread is unsafe in the light of the present
1927 * model defined by cpr to suspend the system. Specifically over the
1928 * wire calls are cpr-unsafe. The thread should be reevaluated in
1929 * case of future updates to the cpr model.
1930 */
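/*
 * For example, with mi_async_init_clusters set to N, up to N
 * consecutive requests are pulled from the current queue (say,
 * NFS_PUTAPAGE) before mi_async_curr advances to the next non-empty
 * queue, so a burst of dirty pages tends to reach the server
 * back-to-back even when readaheads are queued at the same time.
 */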
1931 static void
1932 nfs_async_start(struct vfs *vfsp)
1933 {
1934 struct nfs_async_reqs *args;
1935 mntinfo_t *mi = VFTOMI(vfsp);
1936 clock_t time_left = 1;
1937 callb_cpr_t cprinfo;
1938 int i;
1939
1940 /*
1941 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1942 * built in an implementation independent manner.
1943 */
1944 if (nfs_async_timeout == -1)
1945 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1946
1947 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1948
1949 mutex_enter(&mi->mi_async_lock);
1950 for (;;) {
1951 /*
1952 * Find the next queue containing an entry. We start
1953 * at the current queue pointer and then round robin
1954 * through all of them until we either find a non-empty
1955 * queue or have looked through all of them.
1956 */
1957 for (i = 0; i < NFS_ASYNC_TYPES; i++) {
1958 args = *mi->mi_async_curr;
1959 if (args != NULL)
1960 break;
1961 mi->mi_async_curr++;
1962 if (mi->mi_async_curr ==
1963 &mi->mi_async_reqs[NFS_ASYNC_TYPES])
1964 mi->mi_async_curr = &mi->mi_async_reqs[0];
1965 }
1966 /*
1967 * If we didn't find an entry, then block until woken up
1968 * again and then look through the queues again.
1969 */
1970 if (args == NULL) {
1971 /*
1972 * Exiting is considered to be safe for CPR as well
1973 */
1974 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1975
1976 /*
1977 * Wake up the thread waiting to unmount the file
1978 * system only if all async threads are inactive.
1979 *
1980 * If we've timed out and there's nothing to do,
1981 * then get rid of this thread.
1982 */
1983 if (mi->mi_max_threads == 0 || time_left <= 0) {
1984 if (--mi->mi_threads == 0)
1985 cv_signal(&mi->mi_async_cv);
1986 CALLB_CPR_EXIT(&cprinfo);
1987 VFS_RELE(vfsp); /* release thread's hold */
1988 zthread_exit();
1989 /* NOTREACHED */
1990 }
1991 time_left = cv_timedwait(&mi->mi_async_work_cv,
1992 &mi->mi_async_lock, nfs_async_timeout + lbolt);
1993
1994 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1995
1996 continue;
1997 }
1998 time_left = 1;
1999
2000 /*
2001 * Remove the request from the async queue and then
2002 * update the current async request queue pointer. If
2003 * the current queue is empty or we have removed enough
2004 * consecutive entries from it, then reset the counter
2005 * for this queue and then move the current pointer to
2006 * the next queue.
2007 */
2008 *mi->mi_async_curr = args->a_next;
2009 if (*mi->mi_async_curr == NULL ||
2010 --mi->mi_async_clusters[args->a_io] == 0) {
2011 mi->mi_async_clusters[args->a_io] =
2012 mi->mi_async_init_clusters;
2013 mi->mi_async_curr++;
2014 if (mi->mi_async_curr ==
2015 &mi->mi_async_reqs[NFS_ASYNC_TYPES])
2016 mi->mi_async_curr = &mi->mi_async_reqs[0];
2017 }
2018
2019 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2020 mutex_enter(&mi->mi_lock);
2021 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2022 mutex_exit(&mi->mi_lock);
2023 }
2024
2025 mutex_exit(&mi->mi_async_lock);
2026
2027 /*
2028 * Obtain arguments from the async request structure.
2029 */
2030 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2031 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2032 args->a_nfs_addr, args->a_nfs_seg,
2033 args->a_cred);
2034 } else if (args->a_io == NFS_PUTAPAGE) {
2035 (void) (*args->a_nfs_putapage)(args->a_vp,
2036 args->a_nfs_pp, args->a_nfs_off,
2037 args->a_nfs_len, args->a_nfs_flags,
2038 args->a_cred);
2039 } else if (args->a_io == NFS_PAGEIO) {
2040 (void) (*args->a_nfs_pageio)(args->a_vp,
2041 args->a_nfs_pp, args->a_nfs_off,
2042 args->a_nfs_len, args->a_nfs_flags,
2043 args->a_cred);
2044 } else if (args->a_io == NFS_READDIR) {
2045 (void) ((*args->a_nfs_readdir)(args->a_vp,
2046 args->a_nfs_rdc, args->a_cred));
2047 } else if (args->a_io == NFS_COMMIT) {
2048 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2049 args->a_nfs_offset, args->a_nfs_count,
2050 args->a_cred);
2051 } else if (args->a_io == NFS_INACTIVE) {
2052 (*args->a_nfs_inactive)(args->a_vp, args->a_cred);
2053 }
2054
2055 /*
2056 * Now, release the vnode and free the credentials
2057 * structure.
2058 */
2059 free_async_args(args);
2060 /*
2061 * Reacquire the mutex because it will be needed at the top of the loop.
2062 */
2063 mutex_enter(&mi->mi_async_lock);
2064 }
2065 }
2066
2067 void
2068 nfs_async_stop(struct vfs *vfsp)
2069 {
2070 mntinfo_t *mi = VFTOMI(vfsp);
2071
2072 /*
2073 * Wait for all outstanding async operations to complete and for the
2074 * worker threads to exit.
2075 */
2076 mutex_enter(&mi->mi_async_lock);
2077 mi->mi_max_threads = 0;
2078 cv_broadcast(&mi->mi_async_work_cv);
2079 while (mi->mi_threads != 0)
2080 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2081 mutex_exit(&mi->mi_async_lock);
2082 }
2083
2084 /*
2085 * nfs_async_stop_sig:
2086 * Wait for all outstanding putpage operation to complete. If a signal
2087 * is deliver we will abort and return non-zero. If we can put all the
2088 * pages we will return 0. This routine is called from nfs_unmount and
2089 * nfs3_unmount to make these operations interruptable.
2090 */
2091 int
2092 nfs_async_stop_sig(struct vfs *vfsp)
2093 {
2094 mntinfo_t *mi = VFTOMI(vfsp);
2095 ushort_t omax;
2096 int rval;
2097
2098 /*
2099 * Wait for all outstanding async operations to complete and for the
2100 * worker threads to exit.
2101 */
2102 mutex_enter(&mi->mi_async_lock);
2103 omax = mi->mi_max_threads;
2104 mi->mi_max_threads = 0;
2105 /*
2106 * Tell all the worker threads to exit.
2107 */
2108 cv_broadcast(&mi->mi_async_work_cv);
2109 while (mi->mi_threads != 0) {
2110 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2111 break;
2112 }
2113 rval = (mi->mi_threads != 0); /* Interrupted */
2114 if (rval)
2115 mi->mi_max_threads = omax;
2116 mutex_exit(&mi->mi_async_lock);
2117
2118 return (rval);
2119 }
2120
2121 int
2122 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2123 {
2124 int pagecreate;
2125 int n;
2126 int saved_n;
2127 caddr_t saved_base;
2128 u_offset_t offset;
2129 int error;
2130 int sm_error;
2131
2132 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2133 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2134 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2135
2136 /*
2137 * Move bytes in at most PAGESIZE chunks. We must avoid
2138 * spanning pages in uiomove() because page faults may cause
2139 * the cache to be invalidated out from under us. The r_size is not
2140 * updated until after the uiomove. If we push the last page of a
2141 * file before r_size is correct, we will lose the data written past
2142 * the current (and invalid) r_size.
2143 */
2144 do {
2145 offset = uio->uio_loffset;
2146 pagecreate = 0;
2147
2148 /*
2149 * n is the number of bytes required to satisfy the request
2150 * or the number of bytes to fill out the page.
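 *
 * For example, assuming an 8K page size, a 20000 byte request that
 * starts 0x100 bytes into a page copies n = 8192 - 0x100 = 7936 bytes
 * on the first pass and at most page-sized chunks on later passes.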
2151 */
2152 n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)),
2153 tcount);
2154
2155 /*
2156 * Check to see if we can skip reading in the page
2157 * and just allocate the memory. We can do this
2158 * if we are going to rewrite the entire mapping
2159 * or if we are going to write to or beyond the current
2160 * end of file from the beginning of the mapping.
2161 *
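 * For example, an append at or beyond the current end of file into a
 * page-aligned mapping takes the pagecreate path below, so the page
 * is not read in first just to be overwritten.
 *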
2162 * The read of r_size is now protected by r_statelock.
2163 */
2164 mutex_enter(&rp->r_statelock);
2165 /*
2166 * When pgcreated is nonzero the caller has already done
2167 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2168 * segkpm this means we already have at least one page
2169 * created and mapped at base.
2170 */
2171 pagecreate = pgcreated ||
2172 (((uintptr_t)base & PAGEOFFSET) == 0 &&
2173 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2174
2175 mutex_exit(&rp->r_statelock);
2176 if (pagecreate) {
2177 /*
2178 * The last argument tells segmap_pagecreate() to
2179 * always lock the page, as opposed to sometimes
2180 * returning with the page locked. This way we avoid a
2181 * fault on the ensuing uiomove(), but also
2182 * more importantly (to fix bug 1094402) we can
2183 * call segmap_fault() to unlock the page in all
2184 * cases. An alternative would be to modify
2185 * segmap_pagecreate() to tell us when it is
2186 * locking a page, but that's a fairly major
2187 * interface change.
2188 */
2189 if (pgcreated == 0)
2190 (void) segmap_pagecreate(segkmap, base,
2191 (uint_t)n, 1);
2192 saved_base = base;
2193 saved_n = n;
2194 }
2195
2196 /*
2197 * The number of bytes of data in the last page cannot be
2198 * accurately determined while the page is being uiomove'd
2199 * to and the size of the file is being updated.
2200 * Thus, inform threads which need to know accurately
2201 * how much data is in the last page of the file. They
2202 * will not do the i/o immediately, but will arrange for
2203 * the i/o to happen later when this modify operation
2204 * will have finished.
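 *
 * (RMODINPROGRESS and r_modaddr, set just below, are the mechanism
 * used to convey this to those threads.)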
2205 */
2206 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2207 mutex_enter(&rp->r_statelock);
2208 rp->r_flags |= RMODINPROGRESS;
2209 rp->r_modaddr = (offset & MAXBMASK);
2210 mutex_exit(&rp->r_statelock);
2211
2212 error = uiomove(base, n, UIO_WRITE, uio);
2213
2214 /*
2215 * r_size is the maximum number of
2216 * bytes known to be in the file.
2217 * Make sure it is at least as high as the
2218 * first unwritten byte pointed to by uio_loffset.
2219 */
2220 mutex_enter(&rp->r_statelock);
2221 if (rp->r_size < uio->uio_loffset)
2222 rp->r_size = uio->uio_loffset;
2223 rp->r_flags &= ~RMODINPROGRESS;
2224 rp->r_flags |= RDIRTY;
2225 mutex_exit(&rp->r_statelock);
2226
2227 /* n = # of bytes written */
2228 n = (int)(uio->uio_loffset - offset);
2229 base += n;
2230 tcount -= n;
2231 /*
2232 * If we created pages w/o initializing them completely,
2233 * we need to zero the part that wasn't set up.
2234 * This happens in most EOF write cases and if
2235 * we had some sort of error during the uiomove.
2236 */
2237 if (pagecreate) {
2238 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2239 (void) kzero(base, PAGESIZE - n);
2240
2241 if (pgcreated) {
2242 /*
2243 * Caller is responsible for this page,
2244 * it was not created in this loop.
2245 */
2246 pgcreated = 0;
2247 } else {
2248 /*
2249 * For bug 1094402: segmap_pagecreate locks
2250 * page. Unlock it. This also unlocks the
2251 * pages allocated by page_create_va() in
2252 * segmap_pagecreate().
2253 */
2254 sm_error = segmap_fault(kas.a_hat, segkmap,
2255 saved_base, saved_n,
2256 F_SOFTUNLOCK, S_WRITE);
2257 if (error == 0)
2258 error = sm_error;
2259 }
2260 }
2261 } while (tcount > 0 && error == 0);
2262
2263 return (error);
2264 }
2265
2266 int
2267 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2268 {
2269 rnode_t *rp;
2270 page_t *pp;
2271 u_offset_t eoff;
2272 u_offset_t io_off;
2273 size_t io_len;
2274 int error;
2275 int rdirty;
2276 int err;
2277
2278 rp = VTOR(vp);
2279 ASSERT(rp->r_count > 0);
2280
2281 if (!vn_has_cached_data(vp))
2282 return (0);
2283
2284 ASSERT(vp->v_type != VCHR);
2285
2286 /*
2287 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2288 * writes. B_FORCE is set to force the VM system to actually
2289 * invalidate the pages, even if the i/o failed. The pages
2290 * need to get invalidated because they can't be written out
2291 * because there isn't any space left on either the server's
2292 * file system or in the user's disk quota. The B_FREE bit
2293 * is cleared to avoid confusion as to whether this is a
2294 * request to place the page on the freelist or to destroy
2295 * it.
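 *
 * For example, a B_FREE | B_ASYNC request arriving here is rewritten
 * to B_INVAL | B_FORCE | B_ASYNC by the test below.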
2296 */
2297 if ((rp->r_flags & ROUTOFSPACE) ||
2298 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2299 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2300
2301 if (len == 0) {
2302 /*
2303 * If doing a full file synchronous operation, then clear
2304 * the RDIRTY bit. If a page gets dirtied while the flush
2305 * is happening, then RDIRTY will get set again. The
2306 * RDIRTY bit must get cleared before the flush so that
2307 * we don't lose this information.
2308 */
2309 if (off == (u_offset_t)0 &&
2310 !(flags & B_ASYNC) &&
2311 (rp->r_flags & RDIRTY)) {
2312 mutex_enter(&rp->r_statelock);
2313 rdirty = (rp->r_flags & RDIRTY);
2314 rp->r_flags &= ~RDIRTY;
2315 mutex_exit(&rp->r_statelock);
2316 } else
2317 rdirty = 0;
2318
2319 /*
2320 * Search the entire vp list for pages >= off, and flush
2321 * the dirty pages.
2322 */
2323 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2324 flags, cr);
2325
2326 /*
2327 * If an error occurred and the file was marked as dirty
2328 * before and we aren't forcibly invalidating pages, then
2329 * reset the RDIRTY flag.
2330 */
2331 if (error && rdirty &&
2332 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2333 mutex_enter(&rp->r_statelock);
2334 rp->r_flags |= RDIRTY;
2335 mutex_exit(&rp->r_statelock);
2336 }
2337 } else {
2338 /*
2339 * Do a range from [off...off + len) looking for pages
2340 * to deal with.
2341 */
2342 error = 0;
2343 #ifdef lint
2344 io_len = 0;
2345 #endif
2346 eoff = off + len;
2347 mutex_enter(&rp->r_statelock);
2348 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2349 io_off += io_len) {
2350 mutex_exit(&rp->r_statelock);
2351 /*
2352 * If we are not invalidating, synchronously
2353 * freeing or writing pages, use the routine
2354 * page_lookup_nowait() to prevent reclaiming
2355 * them from the free list.
2356 */
2357 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2358 pp = page_lookup(vp, io_off,
2359 (flags & (B_INVAL | B_FREE)) ?
2360 SE_EXCL : SE_SHARED);
2361 } else {
2362 pp = page_lookup_nowait(vp, io_off,
2363 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2364 }
2365
2366 if (pp == NULL || !pvn_getdirty(pp, flags))
2367 io_len = PAGESIZE;
2368 else {
2369 err = (*rp->r_putapage)(vp, pp, &io_off,
2370 &io_len, flags, cr);
2371 if (!error)
2372 error = err;
2373 /*
2374 * "io_off" and "io_len" are returned as
2375 * the range of pages we actually wrote.
2376 * This allows us to skip ahead more quickly
2377 * since several pages may have been dealt
2378 * with by this iteration of the loop.
2379 */
2380 }
2381 mutex_enter(&rp->r_statelock);
2382 }
2383 mutex_exit(&rp->r_statelock);
2384 }
2385
2386 return (error);
2387 }
2388
2389 void
2390 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2391 {
2392 rnode_t *rp;
2393
2394 rp = VTOR(vp);
2395 mutex_enter(&rp->r_statelock);
2396 while (rp->r_flags & RTRUNCATE)
2397 cv_wait(&rp->r_cv, &rp->r_statelock);
2398 rp->r_flags |= RTRUNCATE;
2399 if (off == (u_offset_t)0) {
2400 rp->r_flags &= ~RDIRTY;
2401 if (!(rp->r_flags & RSTALE))
2402 rp->r_error = 0;
2403 }
2404 rp->r_truncaddr = off;
2405 mutex_exit(&rp->r_statelock);
2406 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2407 B_INVAL | B_TRUNC, cr);
2408 mutex_enter(&rp->r_statelock);
2409 rp->r_flags &= ~RTRUNCATE;
2410 cv_broadcast(&rp->r_cv);
2411 mutex_exit(&rp->r_statelock);
2412 }
2413
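/*
 * The MSG() format strings below begin with the cmn_err() "^" prefix
 * (console only).  When nfs_write_error_to_cons_only is zero the macro
 * skips that first character so the message also reaches the system log.
 */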
2414 static int nfs_write_error_to_cons_only = 0;
2415 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1)
2416
2417 /*
2418 * Print a file handle
2419 */
2420 void
2421 nfs_printfhandle(nfs_fhandle *fhp)
2422 {
2423 int *ip;
2424 char *buf;
2425 size_t bufsize;
2426 char *cp;
2427
2428 /*
2429 * 13 == "(file handle:"
2430 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2431 * 1 == ' '
2432 * 8 == maximum strlen of "%x"
2433 * 3 == ")\n\0"
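 *
 * e.g., if NFS_FHANDLE_LEN is 64 and sizeof (*ip) is 4, this works
 * out to 13 + 16 * 9 + 3 = 160 bytes (illustrative arithmetic only).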
2434 */
2435 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2436 buf = kmem_alloc(bufsize, KM_NOSLEEP);
2437 if (buf == NULL)
2438 return;
2439
2440 cp = buf;
2441 (void) strcpy(cp, "(file handle:");
2442 while (*cp != '\0')
2443 cp++;
2444 for (ip = (int *)fhp->fh_buf;
2445 ip < (int *)&fhp->fh_buf[fhp->fh_len];
2446 ip++) {
2447 (void) sprintf(cp, " %x", *ip);
2448 while (*cp != '\0')
2449 cp++;
2450 }
2451 (void) strcpy(cp, ")\n");
2452
2453 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2454
2455 kmem_free(buf, bufsize);
2456 }
2457
2458 /*
2459 * Notify the system administrator that an NFS write error has
2460 * occurred.
2461 */
2462
2463 /* seconds between ENOSPC/EDQUOT messages */
2464 clock_t nfs_write_error_interval = 5;
2465
2466 void
2467 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2468 {
2469 mntinfo_t *mi;
2470
2471 mi = VTOMI(vp);
2472 /*
2473 * In case of forced unmount or zone shutdown, do not print any
2474 * messages, since doing so can flood the console.
2475 */
2476 if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2477 return;
2478
2479 /*
2480 * No use in flooding the console with ENOSPC
2481 * messages from the same file system.
2482 */
2483 if ((error != ENOSPC && error != EDQUOT) ||
2484 lbolt - mi->mi_printftime > 0) {
2485 zoneid_t zoneid = mi->mi_zone->zone_id;
2486
2487 #ifdef DEBUG
2488 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2489 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2490 #else
2491 nfs_perror(error, "NFS write error on host %s: %m.\n",
2492 VTOR(vp)->r_server->sv_hostname, NULL);
2493 #endif
2494 if (error == ENOSPC || error == EDQUOT) {
2495 zcmn_err(zoneid, CE_CONT,
2496 MSG("^File: userid=%d, groupid=%d\n"),
2497 crgetuid(cr), crgetgid(cr));
2498 if (crgetuid(CRED()) != crgetuid(cr) ||
2499 crgetgid(CRED()) != crgetgid(cr)) {
2500 zcmn_err(zoneid, CE_CONT,
2501 MSG("^User: userid=%d, groupid=%d\n"),
2502 crgetuid(CRED()), crgetgid(CRED()));
2503 }
2504 mi->mi_printftime = lbolt +
2505 nfs_write_error_interval * hz;
2506 }
2507 nfs_printfhandle(&VTOR(vp)->r_fh);
2508 #ifdef DEBUG
2509 if (error == EACCES) {
2510 zcmn_err(zoneid, CE_CONT,
2511 MSG("^nfs_bio: cred is%s kcred\n"),
2512 cr == kcred ? "" : " not");
2513 }
2514 #endif
2515 }
2516 }
2517
2518 /* ARGSUSED */
2519 static void *
2520 nfs_mi_init(zoneid_t zoneid)
2521 {
2522 struct mi_globals *mig;
2523
2524 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2525 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2526 list_create(&mig->mig_list, sizeof (mntinfo_t),
2527 offsetof(mntinfo_t, mi_zone_node));
2528 mig->mig_destructor_called = B_FALSE;
2529 return (mig);
2530 }
2531
2532 /*
2533 * Callback routine to tell all NFS mounts in the zone to stop creating new
2534 * threads. Existing threads should exit.
2535 */
2536 /* ARGSUSED */
2537 static void
2538 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2539 {
2540 struct mi_globals *mig = data;
2541 mntinfo_t *mi;
2542
2543 ASSERT(mig != NULL);
2544 again:
2545 mutex_enter(&mig->mig_lock);
2546 for (mi = list_head(&mig->mig_list); mi != NULL;
2547 mi = list_next(&mig->mig_list, mi)) {
2548
2549 /*
2550 * If we've done the shutdown work for this FS, skip.
2551 * Once we go off the end of the list, we're done.
2552 */
2553 if (mi->mi_flags & MI_DEAD)
2554 continue;
2555
2556 /*
2557 * We will do work, so not done. Get a hold on the FS.
2558 */
2559 VFS_HOLD(mi->mi_vfsp);
2560
2561 /*
2562 * purge the DNLC for this filesystem
2563 */
2564 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2565
2566 mutex_enter(&mi->mi_async_lock);
2567 /*
2568 * Tell existing async worker threads to exit.
2569 */
2570 mi->mi_max_threads = 0;
2571 cv_broadcast(&mi->mi_async_work_cv);
2572 /*
2573 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2574 * getting ready to exit when it's done with its current work.
2575 * Also set MI_DEAD to note we've acted on this FS.
2576 */
2577 mutex_enter(&mi->mi_lock);
2578 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2579 mutex_exit(&mi->mi_lock);
2580 /*
2581 * Wake up the async manager thread.
2582 */
2583 cv_broadcast(&mi->mi_async_reqs_cv);
2584 mutex_exit(&mi->mi_async_lock);
2585
2586 /*
2587 * Drop lock and release FS, which may change list, then repeat.
2588 * We're done when every mi has been done or the list is empty.
2589 */
2590 mutex_exit(&mig->mig_lock);
2591 VFS_RELE(mi->mi_vfsp);
2592 goto again;
2593 }
2594 mutex_exit(&mig->mig_lock);
2595 }
2596
2597 static void
2598 nfs_mi_free_globals(struct mi_globals *mig)
2599 {
2600 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2601 mutex_destroy(&mig->mig_lock);
2602 kmem_free(mig, sizeof (*mig));
2603
2604 }
2605
2606 /* ARGSUSED */
2607 static void
2608 nfs_mi_destroy(zoneid_t zoneid, void *data)
2609 {
2610 struct mi_globals *mig = data;
2611
2612 ASSERT(mig != NULL);
2613 mutex_enter(&mig->mig_lock);
2614 if (list_head(&mig->mig_list) != NULL) {
2615 /* Still waiting for VFS_FREEVFS() */
2616 mig->mig_destructor_called = B_TRUE;
2617 mutex_exit(&mig->mig_lock);
2618 return;
2619 }
2620 nfs_mi_free_globals(mig);
2621 }
2622
2623 /*
2624 * Add an NFS mount to the per-zone list of NFS mounts.
2625 */
2626 void
2627 nfs_mi_zonelist_add(mntinfo_t *mi)
2628 {
2629 struct mi_globals *mig;
2630
2631 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2632 mutex_enter(&mig->mig_lock);
2633 list_insert_head(&mig->mig_list, mi);
2634 mutex_exit(&mig->mig_lock);
2635 }
2636
2637 /*
2638 * Remove an NFS mount from the per-zone list of NFS mounts.
2639 */
2640 static void
2641 nfs_mi_zonelist_remove(mntinfo_t *mi)
2642 {
2643 struct mi_globals *mig;
2644
2645 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2646 mutex_enter(&mig->mig_lock);
2647 list_remove(&mig->mig_list, mi);
2648 /*
2649 * We can be called asynchronously by VFS_FREEVFS() after the zone
2650 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2651 * mi globals.
2652 */
2653 if (list_head(&mig->mig_list) == NULL &&
2654 mig->mig_destructor_called == B_TRUE) {
2655 nfs_mi_free_globals(mig);
2656 return;
2657 }
2658 mutex_exit(&mig->mig_lock);
2659 }
2660
2661 /*
2662 * NFS Client initialization routine. This routine should only be called
2663 * once. It performs the following tasks:
2664 * - Initialize all global locks
2665 * - Call sub-initialization routines (localize access to variables)
2666 */
2667 int
2668 nfs_clntinit(void)
2669 {
2670 #ifdef DEBUG
2671 static boolean_t nfs_clntup = B_FALSE;
2672 #endif
2673 int error;
2674
2675 #ifdef DEBUG
2676 ASSERT(nfs_clntup == B_FALSE);
2677 #endif
2678
2679 error = nfs_subrinit();
2680 if (error)
2681 return (error);
2682
2683 error = nfs_vfsinit();
2684 if (error) {
2685 /*
2686 * Cleanup nfs_subrinit() work
2687 */
2688 nfs_subrfini();
2689 return (error);
2690 }
2691 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2692 nfs_mi_destroy);
2693
2694 x_READ3args = xdr_READ3args;
2695 x_READ3res = xdr_READ3res;
2696 x_READ3vres = xdr_READ3vres;
2697 x_READ3uiores = xdr_READ3uiores;
2698
2699 nfs4_clnt_init();
2700
2701 #ifdef DEBUG
2702 nfs_clntup = B_TRUE;
2703 #endif
2704
2705 return (0);
2706 }
2707
2708 /*
2709 * This routine is only called if the NFS Client has been initialized but
2710 * the module failed to be installed. This routine will cleanup the previously
2711 * allocated/initialized work.
2712 */
2713 void
2714 nfs_clntfini(void)
2715 {
2716 (void) zone_key_delete(mi_list_key);
2717 nfs_subrfini();
2718 nfs_vfsfini();
2719 nfs4_clnt_fini();
2720 }
2721
2722 /*
2723 * nfs_lockrelease:
2724 *
2725 * Release any locks on the given vnode that are held by the current
2726 * process.
2727 */
2728 void
2729 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2730 {
2731 flock64_t ld;
2732 struct shrlock shr;
2733 char *buf;
2734 int remote_lock_possible;
2735 int ret;
2736
2737 ASSERT((uintptr_t)vp > KERNELBASE);
2738
2739 /*
2740 * Generate an explicit unlock operation for the entire file. As a
2741 * partial optimization, only generate the unlock if there is a
2742 * lock registered for the file. We could check whether this
2743 * particular process has any locks on the file, but that would
2744 * require the local locking code to provide yet another query
2745 * routine. Note that no explicit synchronization is needed here.
2746 * At worst, flk_has_remote_locks() will return a false positive,
2747 * in which case the unlock call wastes time but doesn't harm
2748 * correctness.
2749 *
2750 * In addition, an unlock request is generated if the process
2751 * is listed as possibly having a lock on the file because the
2752 * server and client lock managers may have gotten out of sync.
2753 * N.B. It is important to make sure nfs_remove_locking_id() is
2754 * called here even if flk_has_remote_locks(vp) reports true.
2755 * If it is not called and there is an entry on the process id
2756 * list, that entry will never get removed.
2757 */
2758 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2759 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2760 if (remote_lock_possible || flk_has_remote_locks(vp)) {
2761 ld.l_type = F_UNLCK; /* set to unlock entire file */
2762 ld.l_whence = 0; /* unlock from start of file */
2763 ld.l_start = 0;
2764 ld.l_len = 0; /* do entire file */
2765 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr);
2766
2767 if (ret != 0) {
2768 /*
2769 * If VOP_FRLOCK fails, make sure we unregister
2770 * local locks before we continue.
2771 */
2772 ld.l_pid = ttoproc(curthread)->p_pid;
2773 lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2774 #ifdef DEBUG
2775 nfs_perror(ret,
2776 "NFS lock release error on vp %p: %m.\n",
2777 (void *)vp, NULL);
2778 #endif
2779 }
2780
2781 /*
2782 * The call to VOP_FRLOCK may put the pid back on the
2783 * list. We need to remove it.
2784 */
2785 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2786 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2787 }
2788
2789 /*
2790 * As long as the vp has a share matching our pid,
2791 * pluck it off and unshare it. There are circumstances in
2792 * which the call to nfs_remove_locking_id() may put the
2793 * owner back on the list, in which case we simply do a
2794 * redundant and harmless unshare.
2795 */
2796 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2797 while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2798 (char *)NULL, buf, &shr.s_own_len)) {
2799 shr.s_owner = buf;
2800 shr.s_access = 0;
2801 shr.s_deny = 0;
2802 shr.s_sysid = 0;
2803 shr.s_pid = curproc->p_pid;
2804
2805 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr);
2806 #ifdef DEBUG
2807 if (ret != 0) {
2808 nfs_perror(ret,
2809 "NFS share release error on vp %p: %m.\n",
2810 (void *)vp, NULL);
2811 }
2812 #endif
2813 }
2814 kmem_free(buf, MAX_SHR_OWNER_LEN);
2815 }
2816
2817 /*
2818 * nfs_lockcompletion:
2819 *
2820 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2821 * as non-cacheable (set the VNOCACHE bit).
2822 */
2823
2824 void
2825 nfs_lockcompletion(vnode_t *vp, int cmd)
2826 {
2827 #ifdef DEBUG
2828 rnode_t *rp = VTOR(vp);
2829
2830 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2831 #endif
2832
2833 if (cmd == F_SETLK || cmd == F_SETLKW) {
2834 if (!lm_safemap(vp)) {
2835 mutex_enter(&vp->v_lock);
2836 vp->v_flag |= VNOCACHE;
2837 mutex_exit(&vp->v_lock);
2838 } else {
2839 mutex_enter(&vp->v_lock);
2840 vp->v_flag &= ~VNOCACHE;
2841 mutex_exit(&vp->v_lock);
2842 }
2843 }
2844 /*
2845 * The cached attributes of the file are stale after acquiring
2846 * the lock on the file. They were updated when the file was
2847 * opened, but not updated when the lock was acquired. Therefore the
2848 * cached attributes are invalidated after the lock is obtained.
2849 */
2850 PURGE_ATTRCACHE(vp);
2851 }
2852
2853 /*
2854 * The lock manager holds state making it possible for the client
2855 * and server to be out of sync. For example, if the response from
2856 * the server granting a lock request is lost, the server will think
2857 * the lock is granted and the client will think the lock is lost.
2858 * The client can tell when it is not certain whether it is in sync
2859 * with the server.
2860 *
2861 * To deal with this, a list of processes for which the client is
2862 * not sure if the server holds a lock is attached to the rnode.
2863 * When such a process closes the rnode, an unlock request is sent
2864 * to the server to unlock the entire file.
2865 *
2866 * The list is kept as a singly linked, NULL-terminated list.
2867 * Because it is only added to under extreme error conditions, the
2868 * list shouldn't get very big. DEBUG kernels print a message if
2869 * the list gets bigger than nfs_lmpl_high_water, an arbitrarily
2870 * chosen threshold that can be tuned at runtime.
2871 */
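/*
 * In short: r_lmpl is that singly linked list, hanging off each rnode.
 * Entries are added by nfs_add_locking_id() below and removed (and
 * acted upon at file close) by nfs_lockrelease() via
 * nfs_remove_locking_id().
 */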
2872 #ifdef DEBUG
2873 /* int nfs_lmpl_high_water = 8; */
2874 int nfs_lmpl_high_water = 128;
2875 int nfs_cnt_add_locking_id = 0;
2876 int nfs_len_add_locking_id = 0;
2877 #endif /* DEBUG */
2878
2879 /*
2880 * Record that the nfs lock manager server may be holding a lock on
2881 * a vnode for a process.
2882 *
2883 * Because the nfs lock manager server holds state, it is possible
2884 * for the server to get out of sync with the client. This routine is called
2885 * from the client when it is no longer sure if the server is in sync
2886 * with the client. nfs_lockrelease() will then notice this and send
2887 * an unlock request when the file is closed.
2888 */
2889 void
2890 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
2891 {
2892 rnode_t *rp;
2893 lmpl_t *new;
2894 lmpl_t *cur;
2895 lmpl_t **lmplp;
2896 #ifdef DEBUG
2897 int list_len = 1;
2898 #endif /* DEBUG */
2899
2900 #ifdef DEBUG
2901 ++nfs_cnt_add_locking_id;
2902 #endif /* DEBUG */
2903 /*
2904 * allocate new lmpl_t now so we don't sleep
2905 * later after grabbing mutexes
2906 */
2907 ASSERT(len < MAX_SHR_OWNER_LEN);
2908 new = kmem_alloc(sizeof (*new), KM_SLEEP);
2909 new->lmpl_type = type;
2910 new->lmpl_pid = pid;
2911 new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
2912 bcopy(id, new->lmpl_owner, len);
2913 new->lmpl_own_len = len;
2914 new->lmpl_next = (lmpl_t *)NULL;
2915 #ifdef DEBUG
2916 if (type == RLMPL_PID) {
2917 ASSERT(len == sizeof (pid_t));
2918 ASSERT(pid == *(pid_t *)new->lmpl_owner);
2919 } else {
2920 ASSERT(type == RLMPL_OWNER);
2921 }
2922 #endif
2923
2924 rp = VTOR(vp);
2925 mutex_enter(&rp->r_statelock);
2926
2927 /*
2928 * Add this id to the list for this rnode only if the
2929 * rnode is active and the id is not already there.
2930 */
2931 ASSERT(rp->r_flags & RHASHED);
2932 lmplp = &(rp->r_lmpl);
2933 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
2934 if (cur->lmpl_pid == pid &&
2935 cur->lmpl_type == type &&
2936 cur->lmpl_own_len == len &&
2937 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
2938 kmem_free(new->lmpl_owner, len);
2939 kmem_free(new, sizeof (*new));
2940 break;
2941 }
2942 lmplp = &cur->lmpl_next;
2943 #ifdef DEBUG
2944 ++list_len;
2945 #endif /* DEBUG */
2946 }
2947 if (cur == (lmpl_t *)NULL) {
2948 *lmplp = new;
2949 #ifdef DEBUG
2950 if (list_len > nfs_len_add_locking_id) {
2951 nfs_len_add_locking_id = list_len;
2952 }
2953 if (list_len > nfs_lmpl_high_water) {
2954 cmn_err(CE_WARN, "nfs_add_locking_id: long list "
2955 "vp=%p is %d", (void *)vp, list_len);
2956 }
2957 #endif /* DEBUG */
2958 }
2959
2960 #ifdef DEBUG
2961 if (share_debug) {
2962 int nitems = 0;
2963 int npids = 0;
2964 int nowners = 0;
2965
2966 /*
2967 * Count the number of things on r_lmpl after the add.
2968 */
2969 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
2970 cur = cur->lmpl_next) {
2971 nitems++;
2972 if (cur->lmpl_type == RLMPL_PID) {
2973 npids++;
2974 } else if (cur->lmpl_type == RLMPL_OWNER) {
2975 nowners++;
2976 } else {
2977 cmn_err(CE_PANIC, "nfs_add_locking_id: "
2978 "unrecognised lmpl_type %d",
2979 cur->lmpl_type);
2980 }
2981 }
2982
2983 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
2984 "OWNs = %d items left on r_lmpl\n",
2985 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
2986 }
2987 #endif
2988
2989 mutex_exit(&rp->r_statelock);
2990 }
2991
2992 /*
2993 * Remove an id from the lock manager id list.
2994 *
2995 * If the id is not in the list return 0. If it was found and
2996 * removed, return 1.
2997 */
2998 static int
2999 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
3000 {
3001 lmpl_t *cur;
3002 lmpl_t **lmplp;
3003 rnode_t *rp;
3004 int rv = 0;
3005
3006 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
3007
3008 rp = VTOR(vp);
3009
3010 mutex_enter(&rp->r_statelock);
3011 ASSERT(rp->r_flags & RHASHED);
3012 lmplp = &(rp->r_lmpl);
3013
3014 /*
3015 * Search through the list and remove the entry for this id
3016 * if it is there. The special case id == NULL allows removal
3017 * of the first share on the r_lmpl list belonging to the
3018 * current process (if any), without regard to further details
3019 * of its identity.
3020 */
3021 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
3022 if (cur->lmpl_type == type &&
3023 cur->lmpl_pid == curproc->p_pid &&
3024 (id == (char *)NULL ||
3025 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
3026 *lmplp = cur->lmpl_next;
3027 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3028 if (rid != NULL) {
3029 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3030 *rlen = cur->lmpl_own_len;
3031 }
3032 kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3033 kmem_free(cur, sizeof (*cur));
3034 rv = 1;
3035 break;
3036 }
3037 lmplp = &cur->lmpl_next;
3038 }
3039
3040 #ifdef DEBUG
3041 if (share_debug) {
3042 int nitems = 0;
3043 int npids = 0;
3044 int nowners = 0;
3045
3046 /*
3047 * Count the number of things left on r_lmpl after the remove.
3048 */
3049 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3050 cur = cur->lmpl_next) {
3051 nitems++;
3052 if (cur->lmpl_type == RLMPL_PID) {
3053 npids++;
3054 } else if (cur->lmpl_type == RLMPL_OWNER) {
3055 nowners++;
3056 } else {
3057 cmn_err(CE_PANIC,
3058 "nrli: unrecognised lmpl_type %d",
3059 cur->lmpl_type);
3060 }
3061 }
3062
3063 cmn_err(CE_CONT,
3064 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3065 (type == RLMPL_PID) ? "P" : "O",
3066 npids,
3067 nowners,
3068 nitems);
3069 }
3070 #endif
3071
3072 mutex_exit(&rp->r_statelock);
3073 return (rv);
3074 }
3075
3076 void
3077 nfs_free_mi(mntinfo_t *mi)
3078 {
3079 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3080 ASSERT(mi->mi_manager_thread == NULL);
3081 ASSERT(mi->mi_threads == 0);
3082
3083 /*
3084 * Remove the node from the global list before we start tearing it down.
3085 */
3086 nfs_mi_zonelist_remove(mi);
3087 if (mi->mi_klmconfig) {
3088 lm_free_config(mi->mi_klmconfig);
3089 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3090 }
3091 mutex_destroy(&mi->mi_lock);
3092 mutex_destroy(&mi->mi_remap_lock);
3093 mutex_destroy(&mi->mi_async_lock);
3094 cv_destroy(&mi->mi_failover_cv);
3095 cv_destroy(&mi->mi_async_work_cv);
3096 cv_destroy(&mi->mi_async_reqs_cv);
3097 cv_destroy(&mi->mi_async_cv);
3098 zone_rele(mi->mi_zone);
3099 kmem_free(mi, sizeof (*mi));
3100 }
3101
3102 static int
3103 mnt_kstat_update(kstat_t *ksp, int rw)
3104 {
3105 mntinfo_t *mi;
3106 struct mntinfo_kstat *mik;
3107 vfs_t *vfsp;
3108 int i;
3109
3110 /* this is a read-only kstat. Bail out on a write */
3111 if (rw == KSTAT_WRITE)
3112 return (EACCES);
3113
3114 /*
3115 * We don't want to wait here as kstat_chain_lock could be held by
3116 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3117 * and thus could lead to a deadlock.
3118 */
3119 vfsp = (struct vfs *)ksp->ks_private;
3120
3121
3122 mi = VFTOMI(vfsp);
3123
3124 mik = (struct mntinfo_kstat *)ksp->ks_data;
3125
3126 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3127 mik->mik_vers = (uint32_t)mi->mi_vers;
3128 mik->mik_flags = mi->mi_flags;
3129 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3130 mik->mik_curread = (uint32_t)mi->mi_curread;
3131 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3132 mik->mik_retrans = mi->mi_retrans;
3133 mik->mik_timeo = mi->mi_timeo;
3134 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3135 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3136 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3137 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3138 for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3139 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3140 mik->mik_timers[i].deviate =
3141 (uint32_t)mi->mi_timers[i].rt_deviate;
3142 mik->mik_timers[i].rtxcur =
3143 (uint32_t)mi->mi_timers[i].rt_rtxcur;
3144 }
3145 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3146 mik->mik_failover = (uint32_t)mi->mi_failover;
3147 mik->mik_remap = (uint32_t)mi->mi_remap;
3148 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3149
3150 return (0);
3151 }
3152
3153 void
3154 nfs_mnt_kstat_init(struct vfs *vfsp)
3155 {
3156 mntinfo_t *mi = VFTOMI(vfsp);
3157
3158 /*
3159 * Create the version specific kstats.
3160 *
3161 * PSARC 2001/697 Contract Private Interface
3162 * All nfs kstats are under SunMC contract
3163 * Please refer to the PSARC listed above and contact
3164 * SunMC before making any changes!
3165 *
3166 * Changes must be reviewed by Solaris File Sharing
3167 * Changes must be communicated to contract-2001-697@sun.com
3168 *
3169 */
3170
3171 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3172 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3173 if (mi->mi_io_kstats) {
3174 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3175 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3176 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3177 kstat_install(mi->mi_io_kstats);
3178 }
3179
3180 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3181 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3182 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3183 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3184 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3185 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3186 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3187 kstat_install(mi->mi_ro_kstats);
3188 }
3189 }
3190
3191 nfs_delmapcall_t *
3192 nfs_init_delmapcall()
3193 {
3194 nfs_delmapcall_t *delmap_call;
3195
3196 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3197 delmap_call->call_id = curthread;
3198 delmap_call->error = 0;
3199
3200 return (delmap_call);
3201 }
3202
3203 void
3204 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3205 {
3206 kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3207 }
3208
3209 /*
3210 * Searches for the current delmap caller (based on curthread) in the list of
3211 * callers. If it is found, we remove it and free the delmap caller.
3212 * Returns:
3213 * 0 if the caller wasn't found
3214 * 1 if the caller was found, removed and freed. *errp is set to what
3215 * the result of the delmap was.
3216 */
3217 int
3218 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3219 {
3220 nfs_delmapcall_t *delmap_call;
3221
3222 /*
3223 * If the list doesn't exist yet, we create it and return
3224 * that the caller wasn't found. No list = no callers.
3225 */
3226 mutex_enter(&rp->r_statelock);
3227 if (!(rp->r_flags & RDELMAPLIST)) {
3228 /* The list does not exist */
3229 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3230 offsetof(nfs_delmapcall_t, call_node));
3231 rp->r_flags |= RDELMAPLIST;
3232 mutex_exit(&rp->r_statelock);
3233 return (0);
3234 } else {
3235 /* The list exists so search it */
3236 for (delmap_call = list_head(&rp->r_indelmap);
3237 delmap_call != NULL;
3238 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3239 if (delmap_call->call_id == curthread) {
3240 /* current caller is in the list */
3241 *errp = delmap_call->error;
3242 list_remove(&rp->r_indelmap, delmap_call);
3243 mutex_exit(&rp->r_statelock);
3244 nfs_free_delmapcall(delmap_call);
3245 return (1);
3246 }
3247 }
3248 }
3249 mutex_exit(&rp->r_statelock);
3250 return (0);
3251 }