Old nfs_client.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22 /*
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 *
26 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
27 * All rights reserved.
28 */
29
30 #pragma ident "@(#)nfs_client.c 1.194 05/12/08 SMI"
31
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/systm.h>
35 #include <sys/thread.h>
36 #include <sys/t_lock.h>
37 #include <sys/time.h>
38 #include <sys/vnode.h>
39 #include <sys/vfs.h>
40 #include <sys/errno.h>
41 #include <sys/buf.h>
42 #include <sys/stat.h>
43 #include <sys/cred.h>
44 #include <sys/kmem.h>
45 #include <sys/debug.h>
46 #include <sys/dnlc.h>
47 #include <sys/vmsystm.h>
48 #include <sys/flock.h>
49 #include <sys/share.h>
50 #include <sys/cmn_err.h>
51 #include <sys/tiuser.h>
52 #include <sys/sysmacros.h>
53 #include <sys/callb.h>
54 #include <sys/acl.h>
55 #include <sys/kstat.h>
56 #include <sys/signal.h>
57 #include <sys/list.h>
58 #include <sys/zone.h>
59
60 #include <rpc/types.h>
61 #include <rpc/xdr.h>
62 #include <rpc/auth.h>
63 #include <rpc/clnt.h>
64
65 #include <nfs/nfs.h>
66 #include <nfs/nfs_clnt.h>
67
68 #include <nfs/rnode.h>
69 #include <nfs/nfs_acl.h>
70 #include <nfs/lm.h>
71
72 #include <vm/hat.h>
73 #include <vm/as.h>
74 #include <vm/page.h>
75 #include <vm/pvn.h>
76 #include <vm/seg.h>
77 #include <vm/seg_map.h>
78 #include <vm/seg_vn.h>
79
80 static void nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
81 cred_t *);
82 static int nfs_getattr_cache(vnode_t *, struct vattr *);
83 static int nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);
84
85 struct mi_globals {
86 kmutex_t mig_lock; /* lock protecting mig_list */
87 list_t mig_list; /* list of NFS v2 or v3 mounts in zone */
88 boolean_t mig_destructor_called;
89 };
90
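/*
 * Per-zone key for looking up the mi_globals list above; presumably
 * created with zone_key_create() during client module initialization
 * and used with zone_getspecific() to find the zone's mount list.
 */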
91 static zone_key_t mi_list_key;
92
93 /* Debugging flag for PC file shares. */
94 extern int share_debug;
95
96 /*
97 * Attributes caching:
98 *
99 * Attributes are cached in the rnode in struct vattr form.
100 * There is a time associated with the cached attributes (r_attrtime)
101 * which tells whether the attributes are valid. The time is initialized
102 * to the difference between current time and the modify time of the vnode
103 * when new attributes are cached. This allows the attributes for
104 * files that have changed recently to be timed out sooner than for files
105 * that have not changed for a long time. There are minimum and maximum
106 * timeout values that can be set per mount point.
107 */
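/*
 * Illustrative example (assuming typical acregmin=3s/acregmax=60s mount
 * defaults): a file whose last detected change was 10 seconds ago gets a
 * 10 second attribute timeout, while one unchanged for 5 minutes is
 * clamped to the 60 second maximum. See nfs_attrcache_va() below.
 */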
108
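/*
 * Wait until any cache purge being carried out by another thread (the
 * current holder of the r_serial pseudo lock) has completed. The wait
 * is made interruptible according to the MI_INT mount flag; returns
 * EINTR if interrupted, 0 otherwise.
 */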
109 int
110 nfs_waitfor_purge_complete(vnode_t *vp)
111 {
112 rnode_t *rp;
113 k_sigset_t smask;
114
115 rp = VTOR(vp);
116 if (rp->r_serial != NULL && rp->r_serial != curthread) {
117 mutex_enter(&rp->r_statelock);
118 sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
119 while (rp->r_serial != NULL) {
120 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
121 sigunintr(&smask);
122 mutex_exit(&rp->r_statelock);
123 return (EINTR);
124 }
125 }
126 sigunintr(&smask);
127 mutex_exit(&rp->r_statelock);
128 }
129 return (0);
130 }
131
132 /*
133 * Validate caches by checking cached attributes. If the cached
134 * attributes have timed out, then get new attributes from the server.
135 * As a side effect, this will do cache invalidation if the attributes
136 * have changed.
137 *
138 * If the attributes have not timed out and if there is a cache
139 * invalidation being done by some other thread, then wait until that
140 * thread has completed the cache invalidation.
141 */
142 int
143 nfs_validate_caches(vnode_t *vp, cred_t *cr)
144 {
145 int error;
146 struct vattr va;
147
148 if (ATTRCACHE_VALID(vp)) {
149 error = nfs_waitfor_purge_complete(vp);
150 if (error)
151 return (error);
152 return (0);
153 }
154
155 va.va_mask = AT_ALL;
156 return (nfs_getattr_otw(vp, &va, cr));
157 }
158
159 /*
160 * Validate caches by checking cached attributes. If the cached
161 * attributes have timed out, then get new attributes from the server.
162 * As a side effect, this will do cache invalidation if the attributes
163 * have changed.
164 *
165 * If the attributes have not timed out and if there is a cache
166 * invalidation being done by some other thread, then wait until that
167 * thread has completed the cache invalidation.
168 */
169 int
170 nfs3_validate_caches(vnode_t *vp, cred_t *cr)
171 {
172 int error;
173 struct vattr va;
174
175 if (ATTRCACHE_VALID(vp)) {
176 error = nfs_waitfor_purge_complete(vp);
177 if (error)
178 return (error);
179 return (0);
180 }
181
182 va.va_mask = AT_ALL;
183 return (nfs3_getattr_otw(vp, &va, cr));
184 }
185
186 /*
187 * Purge all of the various NFS `data' caches.
188 */
189 void
190 nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
191 {
192 rnode_t *rp;
193 char *contents;
194 int size;
195 int error;
196
197 /*
198 * Purge the DNLC for any entries which refer to this file.
199 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
200 */
201 rp = VTOR(vp);
202 mutex_enter(&rp->r_statelock);
203 if (vp->v_count > 1 &&
204 (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
205 !(rp->r_flags & RINDNLCPURGE)) {
206 /*
207 * Set the RINDNLCPURGE flag to prevent recursive entry
208 * into dnlc_purge_vp()
209 */
210 if (vp->v_type == VDIR)
211 rp->r_flags |= RINDNLCPURGE;
212 mutex_exit(&rp->r_statelock);
213 dnlc_purge_vp(vp);
214 mutex_enter(&rp->r_statelock);
215 if (rp->r_flags & RINDNLCPURGE)
216 rp->r_flags &= ~RINDNLCPURGE;
217 }
218
219 /*
220 * Clear any readdir state bits and purge the readlink response cache.
221 */
222 contents = rp->r_symlink.contents;
223 size = rp->r_symlink.size;
224 rp->r_symlink.contents = NULL;
225 mutex_exit(&rp->r_statelock);
226
227 if (contents != NULL) {
228
229 kmem_free((void *)contents, size);
230 }
231
232 /*
233 * Flush the page cache.
234 */
235 if (vn_has_cached_data(vp)) {
236 error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
237 if (error && (error == ENOSPC || error == EDQUOT)) {
238 mutex_enter(&rp->r_statelock);
239 if (!rp->r_error)
240 rp->r_error = error;
241 mutex_exit(&rp->r_statelock);
242 }
243 }
244
245 /*
246 * Flush the readdir response cache.
247 */
248 if (HAVE_RDDIR_CACHE(rp))
249 nfs_purge_rddir_cache(vp);
250 }
251
252 /*
253 * Purge the readdir cache of all entries
254 */
255 void
256 nfs_purge_rddir_cache(vnode_t *vp)
257 {
258 rnode_t *rp;
259 rddir_cache *rdc;
260 rddir_cache *nrdc;
261
262 rp = VTOR(vp);
263 top:
264 mutex_enter(&rp->r_statelock);
265 rp->r_direof = NULL;
266 rp->r_flags &= ~RLOOKUP;
267 rp->r_flags |= RREADDIRPLUS;
268 rdc = avl_first(&rp->r_dir);
269 while (rdc != NULL) {
270 nrdc = AVL_NEXT(&rp->r_dir, rdc);
271 avl_remove(&rp->r_dir, rdc);
272 rddir_cache_rele(rdc);
273 rdc = nrdc;
274 }
275 mutex_exit(&rp->r_statelock);
276 }
277
278 /*
279 * Do a cache check based on the post-operation attributes.
280 * Then make them the new cached attributes. If no attributes
281 * were returned, then mark the attributes as timed out.
282 */
283 void
284 nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
285 {
286 vattr_t attr;
287
288 if (!poap->attributes) {
289 PURGE_ATTRCACHE(vp);
290 return;
291 }
292 (void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
293 }
294
295 /*
296 * Same as above, but using a vattr
297 */
298 void
299 nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
300 cred_t *cr)
301 {
302 if (!poap->attributes) {
303 PURGE_ATTRCACHE(vp);
304 return;
305 }
306 nfs_attr_cache(vp, poap->fres.vap, t, cr);
307 }
308
309 /*
310 * Do a cache check based on the weak cache consistency attributes.
311 * These consist of a small set of pre-operation attributes and the
312 * full set of post-operation attributes.
313 *
314 * If we are given the pre-operation attributes, then use them to
315 * check the validity of the various caches. Then, if we got the
316 * post-operation attributes, make them the new cached attributes.
317 * If we didn't get the post-operation attributes, then mark the
318 * attribute cache as timed out so that the next reference will
319 * cause a GETATTR to the server to refresh with the current
320 * attributes.
321 *
322 * Otherwise, if we didn't get the pre-operation attributes, but
323 * we did get the post-operation attributes, then use these
324 * attributes to check the validity of the various caches. This
325 * will probably cause a flush of the caches because if the
326 * operation succeeded, the attributes of the object were changed
327 * in some way from the old post-operation attributes. This
328 * should be okay because it is the safe thing to do. After
329 * checking the data caches, then we make these the new cached
330 * attributes.
331 *
332 * Otherwise, we didn't get either the pre- or post-operation
333 * attributes. Simply mark the attribute cache as timed out so
334 * the next reference will cause a GETATTR to the server to
335 * refresh with the current attributes.
336 *
337 * If an error occurred trying to convert the over the wire
338 * attributes to a vattr, then simply mark the attribute cache as
339 * timed out.
340 */
341 void
342 nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
343 {
344 vattr_t bva;
345 vattr_t ava;
346
347 if (wccp->after.attributes) {
348 if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
349 PURGE_ATTRCACHE(vp);
350 return;
351 }
352 if (wccp->before.attributes) {
353 bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
354 bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
355 bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
356 bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
357 bva.va_size = wccp->before.attr.size;
358 nfs3_attr_cache(vp, &bva, &ava, t, cr);
359 } else
360 nfs_attr_cache(vp, &ava, t, cr);
361 } else {
362 PURGE_ATTRCACHE(vp);
363 }
364 }
365
366 /*
367 * Set attributes cache for given vnode using nfsattr.
368 *
369 * This routine does not do cache validation with the attributes.
370 *
371 * If an error occurred trying to convert the over the wire
372 * attributes to a vattr, then simply mark the attribute cache as
373 * timed out.
374 */
375 void
376 nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
377 {
378 rnode_t *rp;
379 struct vattr va;
380
381 if (!nattr_to_vattr(vp, na, &va)) {
382 rp = VTOR(vp);
383 mutex_enter(&rp->r_statelock);
384 if (rp->r_mtime <= t)
385 nfs_attrcache_va(vp, &va);
386 mutex_exit(&rp->r_statelock);
387 } else {
388 PURGE_ATTRCACHE(vp);
389 }
390 }
391
392 /*
393 * Set attributes cache for given vnode using fattr3.
394 *
395 * This routine does not do cache validation with the attributes.
396 *
397 * If an error occurred trying to convert the over the wire
398 * attributes to a vattr, then simply mark the attribute cache as
399 * timed out.
400 */
401 void
402 nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
403 {
404 rnode_t *rp;
405 struct vattr va;
406
407 if (!fattr3_to_vattr(vp, na, &va)) {
408 rp = VTOR(vp);
409 mutex_enter(&rp->r_statelock);
410 if (rp->r_mtime <= t)
411 nfs_attrcache_va(vp, &va);
412 mutex_exit(&rp->r_statelock);
413 } else {
414 PURGE_ATTRCACHE(vp);
415 }
416 }
417
418 /*
419 * Do a cache check based on attributes returned over the wire. The
420 * new attributes are cached.
421 *
422 * If an error occurred trying to convert the over the wire attributes
423 * to a vattr, then just return that error.
424 *
425 * As a side effect, the vattr argument is filled in with the converted
426 * attributes.
427 */
428 int
429 nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
430 cred_t *cr)
431 {
432 int error;
433
434 error = nattr_to_vattr(vp, na, vap);
435 if (error)
436 return (error);
437 nfs_attr_cache(vp, vap, t, cr);
438 return (0);
439 }
440
441 /*
442 * Do a cache check based on attributes returned over the wire. The
443 * new attributes are cached.
444 *
445 * If an error occurred trying to convert the over the wire attributes
446 * to a vattr, then just return that error.
447 *
448 * As a side effect, the vattr argument is filled in with the converted
449 * attributes.
450 */
451 int
452 nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
453 {
454 int error;
455
456 error = fattr3_to_vattr(vp, na, vap);
457 if (error)
458 return (error);
459 nfs_attr_cache(vp, vap, t, cr);
460 return (0);
461 }
462
463 /*
464 * Use the passed in virtual attributes to check to see whether the
465 * data and metadata caches are valid, cache the new attributes, and
466 * then do the cache invalidation if required.
467 *
468 * The cache validation and caching of the new attributes is done
469 * atomically via the use of the mutex, r_statelock. If required,
470 * the cache invalidation is done atomically w.r.t. the cache
471 * validation and caching of the attributes via the pseudo lock,
472 * r_serial.
473 *
474 * This routine is used to do cache validation and attributes caching
475 * for operations with a single set of post operation attributes.
476 */
477 void
478 nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
479 {
480 rnode_t *rp;
481 int mtime_changed;
482 int ctime_changed;
483 vsecattr_t *vsp;
484 int was_serial;
485
486 rp = VTOR(vp);
487
488 mutex_enter(&rp->r_statelock);
489
490 if (rp->r_serial != curthread) {
491 klwp_t *lwp = ttolwp(curthread);
492
493 was_serial = 0;
494 if (lwp != NULL)
495 lwp->lwp_nostop++;
496 while (rp->r_serial != NULL) {
497 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
498 mutex_exit(&rp->r_statelock);
499 if (lwp != NULL)
500 lwp->lwp_nostop--;
501 return;
502 }
503 }
504 if (lwp != NULL)
505 lwp->lwp_nostop--;
506 } else
507 was_serial = 1;
508
509 if (rp->r_mtime > t) {
510 mutex_exit(&rp->r_statelock);
511 return;
512 }
513
514 if (!(rp->r_flags & RWRITEATTR)) {
515 if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
516 mtime_changed = 1;
517 else
518 mtime_changed = 0;
519 if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
520 rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
521 ctime_changed = 1;
522 else
523 ctime_changed = 0;
524 } else if (rp->r_size != vap->va_size &&
525 (!vn_has_cached_data(vp) ||
526 (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) {
527 mtime_changed = 1;
528 ctime_changed = 0;
529 } else {
530 mtime_changed = 0;
531 ctime_changed = 0;
532 }
533
534 nfs_attrcache_va(vp, vap);
535
536 if (!mtime_changed && !ctime_changed) {
537 mutex_exit(&rp->r_statelock);
538 return;
539 }
540
541 rp->r_serial = curthread;
542
543 mutex_exit(&rp->r_statelock);
544
545 if (mtime_changed)
546 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
547
548 if (ctime_changed) {
549 (void) nfs_access_purge_rp(rp);
550 if (rp->r_secattr != NULL) {
551 mutex_enter(&rp->r_statelock);
552 vsp = rp->r_secattr;
553 rp->r_secattr = NULL;
554 mutex_exit(&rp->r_statelock);
555 if (vsp != NULL)
556 nfs_acl_free(vsp);
557 }
558 }
559
560 if (!was_serial) {
561 mutex_enter(&rp->r_statelock);
562 rp->r_serial = NULL;
563 cv_broadcast(&rp->r_cv);
564 mutex_exit(&rp->r_statelock);
565 }
566 }
567
568 /*
569 * Use the passed in "before" virtual attributes to check to see
570 * whether the data and metadata caches are valid, cache the "after"
571 * new attributes, and then do the cache invalidation if required.
572 *
573 * The cache validation and caching of the new attributes is done
574 * atomically via the use of the mutex, r_statelock. If required,
575 * the cache invalidation is done atomically w.r.t. the cache
576 * validation and caching of the attributes via the pseudo lock,
577 * r_serial.
578 *
579 * This routine is used to do cache validation and attributes caching
580 * for operations with both pre operation attributes and post operation
581 * attributes.
582 */
583 static void
584 nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
585 cred_t *cr)
586 {
587 rnode_t *rp;
588 int mtime_changed;
589 int ctime_changed;
590 vsecattr_t *vsp;
591 int was_serial;
592
593 rp = VTOR(vp);
594
595 mutex_enter(&rp->r_statelock);
596
597 if (rp->r_serial != curthread) {
598 klwp_t *lwp = ttolwp(curthread);
599
600 was_serial = 0;
601 if (lwp != NULL)
602 lwp->lwp_nostop++;
603 while (rp->r_serial != NULL) {
604 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
605 mutex_exit(&rp->r_statelock);
606 if (lwp != NULL)
607 lwp->lwp_nostop--;
608 return;
609 }
610 }
611 if (lwp != NULL)
612 lwp->lwp_nostop--;
613 } else
614 was_serial = 1;
615
616 if (rp->r_mtime > t) {
617 mutex_exit(&rp->r_statelock);
618 return;
619 }
620
621 if (!(rp->r_flags & RWRITEATTR)) {
622 if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
623 mtime_changed = 1;
624 else
625 mtime_changed = 0;
626 if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
627 rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
628 ctime_changed = 1;
629 else
630 ctime_changed = 0;
631 } else {
632 mtime_changed = 0;
633 ctime_changed = 0;
634 }
635
636 nfs_attrcache_va(vp, avap);
637
638 if (!mtime_changed && !ctime_changed) {
639 mutex_exit(&rp->r_statelock);
640 return;
641 }
642
643 rp->r_serial = curthread;
644
645 mutex_exit(&rp->r_statelock);
646
647 if (mtime_changed)
648 nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);
649
650 if (ctime_changed) {
651 (void) nfs_access_purge_rp(rp);
652 if (rp->r_secattr != NULL) {
653 mutex_enter(&rp->r_statelock);
654 vsp = rp->r_secattr;
655 rp->r_secattr = NULL;
656 mutex_exit(&rp->r_statelock);
657 if (vsp != NULL)
658 nfs_acl_free(vsp);
659 }
660 }
661
662 if (!was_serial) {
663 mutex_enter(&rp->r_statelock);
664 rp->r_serial = NULL;
665 cv_broadcast(&rp->r_cv);
666 mutex_exit(&rp->r_statelock);
667 }
668 }
669
670 /*
671 * Set attributes cache for given vnode using virtual attributes.
672 *
673 * Set the timeout value on the attribute cache and fill it
674 * with the passed in attributes.
675 *
676 * The caller must be holding r_statelock.
677 */
678 void
679 nfs_attrcache_va(vnode_t *vp, struct vattr *va)
680 {
681 rnode_t *rp;
682 mntinfo_t *mi;
683 hrtime_t delta;
684 hrtime_t now;
685
686 rp = VTOR(vp);
687
688 ASSERT(MUTEX_HELD(&rp->r_statelock));
689
690 now = gethrtime();
691
692 mi = VTOMI(vp);
693
694 /*
695 * Delta is the number of nanoseconds that we will
696 * cache the attributes of the file. It is based on
697 * the number of nanoseconds since the last time that
698 * we detected a change. The assumption is that files
699 * that changed recently are likely to change again.
700 * Separate minimum and maximum timeouts are maintained for regular
701 * files and for directories, and the computed delta is clamped to them.
702 *
703 * Using the time since last change was detected
704 * eliminates direct comparison or calculation
705 * using mixed client and server times. NFS does
706 * not make any assumptions regarding the client
707 * and server clocks being synchronized.
708 */
709 if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
710 va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
711 va->va_size != rp->r_attr.va_size)
712 rp->r_mtime = now;
713
714 if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
715 delta = 0;
716 else {
717 delta = now - rp->r_mtime;
718 if (vp->v_type == VDIR) {
719 if (delta < mi->mi_acdirmin)
720 delta = mi->mi_acdirmin;
721 else if (delta > mi->mi_acdirmax)
722 delta = mi->mi_acdirmax;
723 } else {
724 if (delta < mi->mi_acregmin)
725 delta = mi->mi_acregmin;
726 else if (delta > mi->mi_acregmax)
727 delta = mi->mi_acregmax;
728 }
729 }
730 rp->r_attrtime = now + delta;
731 rp->r_attr = *va;
732 /*
733 * Update the size of the file if there is no cached data or if
734 * the cached data is clean and there is no data being written
735 * out.
736 */
737 if (rp->r_size != va->va_size &&
738 (!vn_has_cached_data(vp) ||
739 (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
740 rp->r_size = va->va_size;
741 nfs_setswaplike(vp, va);
742 rp->r_flags &= ~RWRITEATTR;
743 }
744
745 /*
746 * Fill in attribute from the cache.
747 * If valid, then return 0 to indicate that no error occurred,
748 * otherwise return 1 to indicate that an error occurred.
749 */
750 static int
751 nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
752 {
753 rnode_t *rp;
754
755 rp = VTOR(vp);
756 mutex_enter(&rp->r_statelock);
757 if (ATTRCACHE_VALID(vp)) {
758 /*
759 * Cached attributes are valid
760 */
761 *vap = rp->r_attr;
762 mutex_exit(&rp->r_statelock);
763 return (0);
764 }
765 mutex_exit(&rp->r_statelock);
766 return (1);
767 }
768
769 /*
770 * Get attributes over-the-wire and update attributes cache
771 * if no error occurred in the over-the-wire operation.
772 * Return 0 if successful, otherwise error.
773 */
774 int
775 nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
776 {
777 int error;
778 struct nfsattrstat ns;
779 int douprintf;
780 mntinfo_t *mi;
781 failinfo_t fi;
782 hrtime_t t;
783
784 mi = VTOMI(vp);
785 fi.vp = vp;
786 fi.fhp = NULL; /* no need to update, filehandle not copied */
787 fi.copyproc = nfscopyfh;
788 fi.lookupproc = nfslookup;
789 fi.xattrdirproc = acl_getxattrdir2;
790
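/*
 * Prefer fetching attributes via the NFS_ACL protocol when the mount
 * has MI_ACL set; the flag is re-tested afterwards since
 * acl_getattr2_otw() may clear it on a server that turns out not to
 * support the ACL protocol, in which case we fall through to a plain
 * NFS GETATTR below.
 */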
791 if (mi->mi_flags & MI_ACL) {
792 error = acl_getattr2_otw(vp, vap, cr);
793 if (mi->mi_flags & MI_ACL)
794 return (error);
795 }
796
797 douprintf = 1;
798
799 t = gethrtime();
800
801 error = rfs2call(mi, RFS_GETATTR,
802 xdr_fhandle, (caddr_t)VTOFH(vp),
803 xdr_attrstat, (caddr_t)&ns, cr,
804 &douprintf, &ns.ns_status, 0, &fi);
805
806 if (!error) {
807 error = geterrno(ns.ns_status);
808 if (!error)
809 error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
810 else {
811 PURGE_STALE_FH(error, vp, cr);
812 }
813 }
814
815 return (error);
816 }
817
818 /*
819 * Return either cached or remote attributes. If we get remote attributes,
820 * use them to check and invalidate caches, then cache the new attributes.
821 */
822 int
823 nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
824 {
825 int error;
826 rnode_t *rp;
827
828 /*
829 * If we've got cached attributes, we're done, otherwise go
830 * to the server to get attributes, which will update the cache
831 * in the process.
832 */
833 error = nfs_getattr_cache(vp, vap);
834 if (error)
835 error = nfs_getattr_otw(vp, vap, cr);
836
837 /* Return the client's view of file size */
838 rp = VTOR(vp);
839 mutex_enter(&rp->r_statelock);
840 vap->va_size = rp->r_size;
841 mutex_exit(&rp->r_statelock);
842
843 return (error);
844 }
845
846 /*
847 * Get attributes over-the-wire and update attributes cache
848 * if no error occurred in the over-the-wire operation.
849 * Return 0 if successful, otherwise error.
850 */
851 int
852 nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
853 {
854 int error;
855 GETATTR3args args;
856 GETATTR3vres res;
857 int douprintf;
858 failinfo_t fi;
859 hrtime_t t;
860
861 args.object = *VTOFH3(vp);
862 fi.vp = vp;
863 fi.fhp = (caddr_t)&args.object;
864 fi.copyproc = nfs3copyfh;
865 fi.lookupproc = nfs3lookup;
866 fi.xattrdirproc = acl_getxattrdir3;
867 res.fres.vp = vp;
868 res.fres.vap = vap;
869
870 douprintf = 1;
871
872 t = gethrtime();
873
874 error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
875 xdr_nfs_fh3, (caddr_t)&args,
876 xdr_GETATTR3vres, (caddr_t)&res, cr,
877 &douprintf, &res.status, 0, &fi);
878
879 if (error)
880 return (error);
881
882 error = geterrno3(res.status);
883 if (error) {
884 PURGE_STALE_FH(error, vp, cr);
885 return (error);
886 }
887
888 /*
889 * Catch status codes that indicate fattr3 to vattr translation failure
890 */
891 if (res.fres.status)
892 return (res.fres.status);
893
894 nfs_attr_cache(vp, vap, t, cr);
895 return (0);
896 }
897
898 /*
899 * Return either cached or remote attributes. If we get remote attributes,
900 * use them to check and invalidate caches, then cache the new attributes.
901 */
902 int
903 nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
904 {
905 int error;
906 rnode_t *rp;
907
908 /*
909 * If we've got cached attributes, we're done, otherwise go
910 * to the server to get attributes, which will update the cache
911 * in the process.
912 */
913 error = nfs_getattr_cache(vp, vap);
914 if (error)
915 error = nfs3_getattr_otw(vp, vap, cr);
916
917 /* Return the client's view of file size */
918 rp = VTOR(vp);
919 mutex_enter(&rp->r_statelock);
920 vap->va_size = rp->r_size;
921 mutex_exit(&rp->r_statelock);
922
923 return (error);
924 }
925
926 vtype_t nf_to_vt[] = {
927 VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
928 };
929 /*
930 * Convert NFS Version 2 over the network attributes to the local
931 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
932 * network representation and the local representation is done here.
933 * Returns 0 for success, error if failed due to overflow.
934 */
935 int
936 nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
937 {
938 /* overflow in time attributes? */
939 #ifndef _LP64
940 if (!NFS2_FATTR_TIME_OK(na))
941 return (EOVERFLOW);
942 #endif
943
944 if (na->na_type < NFNON || na->na_type > NFSOC)
945 vap->va_type = VBAD;
946 else
947 vap->va_type = nf_to_vt[na->na_type];
948 vap->va_mode = na->na_mode;
949 vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
950 vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
951 vap->va_fsid = vp->v_vfsp->vfs_dev;
952 vap->va_nodeid = na->na_nodeid;
953 vap->va_nlink = na->na_nlink;
954 vap->va_size = na->na_size; /* keep for cache validation */
955 /*
956 * nfs protocol defines times as unsigned so don't extend sign,
957 * unless sysadmin set nfs_allow_preepoch_time.
958 */
959 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
960 vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
961 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
962 vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
963 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
964 vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
965 /*
966 * Shannon's law - uncompress the received dev_t
967 * if the top half of it is zero, indicating a response
968 * from an `older style' OS. Except for when it is a
969 * `new style' OS sending the maj device of zero,
970 * in which case the algorithm still works because the
971 * fact that it is a new style server
972 * is hidden by the minor device not being greater
973 * than 255 (a requirement in this case).
974 */
975 if ((na->na_rdev & 0xffff0000) == 0)
976 vap->va_rdev = nfsv2_expdev(na->na_rdev);
977 else
978 vap->va_rdev = expldev(na->na_rdev);
979
980 vap->va_nblocks = na->na_blocks;
981 switch (na->na_type) {
982 case NFBLK:
983 vap->va_blksize = DEV_BSIZE;
984 break;
985
986 case NFCHR:
987 vap->va_blksize = MAXBSIZE;
988 break;
989
990 case NFSOC:
991 default:
992 vap->va_blksize = na->na_blocksize;
993 break;
994 }
995 /*
996 * This bit of ugliness is a hack to preserve the
997 * over-the-wire protocols for named-pipe vnodes.
998 * It remaps the special over-the-wire type to the
999 * VFIFO type. (see note in nfs.h)
1000 */
1001 if (NA_ISFIFO(na)) {
1002 vap->va_type = VFIFO;
1003 vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
1004 vap->va_rdev = 0;
1005 vap->va_blksize = na->na_blocksize;
1006 }
1007 vap->va_seq = 0;
1008 return (0);
1009 }
1010
1011 /*
1012 * Convert NFS Version 3 over the network attributes to the local
1013 * virtual attributes. The mapping between the UID_NOBODY/GID_NOBODY
1014 * network representation and the local representation is done here.
1015 */
1016 vtype_t nf3_to_vt[] = {
1017 VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
1018 };
1019
1020 int
1021 fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
1022 {
1023
1024 #ifndef _LP64
1025 /* overflow in time attributes? */
1026 if (!NFS3_FATTR_TIME_OK(na))
1027 return (EOVERFLOW);
1028 #endif
1029 if (!NFS3_SIZE_OK(na->size))
1030 /* file too big */
1031 return (EFBIG);
1032
1033 vap->va_mask = AT_ALL;
1034
1035 if (na->type < NF3REG || na->type > NF3FIFO)
1036 vap->va_type = VBAD;
1037 else
1038 vap->va_type = nf3_to_vt[na->type];
1039 vap->va_mode = na->mode;
1040 vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
1041 vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
1042 vap->va_fsid = vp->v_vfsp->vfs_dev;
1043 vap->va_nodeid = na->fileid;
1044 vap->va_nlink = na->nlink;
1045 vap->va_size = na->size;
1046
1047 /*
1048 * nfs protocol defines times as unsigned so don't extend sign,
1049 * unless sysadmin set nfs_allow_preepoch_time.
1050 */
1051 NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
1052 vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
1053 NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
1054 vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
1055 NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
1056 vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;
1057
1058 switch (na->type) {
1059 case NF3BLK:
1060 vap->va_rdev = makedevice(na->rdev.specdata1,
1061 na->rdev.specdata2);
1062 vap->va_blksize = DEV_BSIZE;
1063 vap->va_nblocks = 0;
1064 break;
1065 case NF3CHR:
1066 vap->va_rdev = makedevice(na->rdev.specdata1,
1067 na->rdev.specdata2);
1068 vap->va_blksize = MAXBSIZE;
1069 vap->va_nblocks = 0;
1070 break;
1071 case NF3REG:
1072 case NF3DIR:
1073 case NF3LNK:
1074 vap->va_rdev = 0;
1075 vap->va_blksize = MAXBSIZE;
1076 vap->va_nblocks = (u_longlong_t)
1077 ((na->used + (size3)DEV_BSIZE - (size3)1) /
1078 (size3)DEV_BSIZE);
1079 break;
1080 case NF3SOCK:
1081 case NF3FIFO:
1082 default:
1083 vap->va_rdev = 0;
1084 vap->va_blksize = MAXBSIZE;
1085 vap->va_nblocks = 0;
1086 break;
1087 }
1088 vap->va_seq = 0;
1089 return (0);
1090 }
1091
1092 /*
1093 * Asynchronous I/O parameters. nfs_async_threads is the high-water mark
1094 * for the demand-based allocation of async threads per-mount. The
1095 * nfs_async_timeout is the amount of time a thread will live after it
1096 * becomes idle, unless new I/O requests are received before the thread
1097 * dies. See nfs_async_putpage and nfs_async_start.
1098 */
1099
1100 int nfs_async_timeout = -1; /* uninitialized */
1101
1102 static void nfs_async_start(struct vfs *);
1103
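/*
 * Release the resources tied to a completed asynchronous request: drop
 * the rnode's r_count (and r_awcount for putpage/pageio work), wake
 * anyone waiting on r_cv, and release the vnode and credential holds
 * taken when the request was queued. NFS_INACTIVE requests never took
 * an r_count or vnode hold, so only the cred and the args are freed.
 */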
1104 static void
1105 free_async_args(struct nfs_async_reqs *args)
1106 {
1107 rnode_t *rp;
1108
1109 if (args->a_io != NFS_INACTIVE) {
1110 rp = VTOR(args->a_vp);
1111 mutex_enter(&rp->r_statelock);
1112 rp->r_count--;
1113 if (args->a_io == NFS_PUTAPAGE ||
1114 args->a_io == NFS_PAGEIO)
1115 rp->r_awcount--;
1116 cv_broadcast(&rp->r_cv);
1117 mutex_exit(&rp->r_statelock);
1118 VN_RELE(args->a_vp);
1119 }
1120 crfree(args->a_cred);
1121 kmem_free(args, sizeof (*args));
1122 }
1123
1124 /*
1125 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
1126 * pageout(), running in the global zone, have legitimate reasons to do
1127 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts. We avoid the problem by
1128 * use of a per-mount "asynchronous requests manager thread" which is
1129 * signaled by the various asynchronous work routines when there is
1130 * asynchronous work to be done. It is responsible for creating new
1131 * worker threads if necessary, and notifying existing worker threads
1132 * that there is work to be done.
1133 *
1134 * In other words, it will "take the specifications from the customers and
1135 * give them to the engineers."
1136 *
1137 * Worker threads die off of their own accord if they are no longer
1138 * needed.
1139 *
1140 * This thread is killed when the zone is going away or the filesystem
1141 * is being unmounted.
1142 */
1143 void
1144 nfs_async_manager(vfs_t *vfsp)
1145 {
1146 callb_cpr_t cprinfo;
1147 mntinfo_t *mi;
1148 uint_t max_threads;
1149
1150 mi = VFTOMI(vfsp);
1151
1152 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
1153 "nfs_async_manager");
1154
1155 mutex_enter(&mi->mi_async_lock);
1156 /*
1157 * We want to stash the max number of threads that this mount was
1158 * allowed so we can use it later when the variable is set to zero as
1159 * part of the zone/mount going away.
1160 *
1161 * We want to be able to create at least one thread to handle
1162 * asynchronous inactive calls.
1163 */
1164 max_threads = MAX(mi->mi_max_threads, 1);
1165 mutex_enter(&mi->mi_lock);
1166 /*
1167 * We don't want to wait for mi_max_threads to go to zero, since that
1168 * happens as part of a failed unmount, but this thread should only
1169 * exit when the mount/zone is really going away.
1170 *
1171 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
1172 * attempted: the various _async_*() functions know to do things
1173 * inline if mi_max_threads == 0. Henceforth we just drain out the
1174 * outstanding requests.
1175 *
1176 * Note that we still create zthreads even if we notice the zone is
1177 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
1178 * shutdown sequence to take slightly longer in some cases, but
1179 * doesn't violate the protocol, as all threads will exit as soon as
1180 * they're done processing the remaining requests.
1181 */
1182 while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
1183 mi->mi_async_req_count > 0) {
1184 mutex_exit(&mi->mi_lock);
1185 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1186 cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
1187 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1188 while (mi->mi_async_req_count > 0) {
1189 /*
1190 * Paranoia: If the mount started out having
1191 * (mi->mi_max_threads == 0), and the value was
1192 * later changed (via a debugger or somesuch),
1193 * we could be confused since we will think we
1194 * can't create any threads, and the calling
1195 * code (which looks at the current value of
1196 * mi->mi_max_threads, now non-zero) thinks we
1197 * can.
1198 *
1199 * So, because we're paranoid, we create threads
1200 * up to the maximum of the original and the
1201 * current value. This means that future
1202 * (debugger-induced) lowerings of
1203 * mi->mi_max_threads are ignored for our
1204 * purposes, but who told them they could change
1205 * random values on a live kernel anyhow?
1206 */
1207 if (mi->mi_threads <
1208 MAX(mi->mi_max_threads, max_threads)) {
1209 mi->mi_threads++;
1210 mutex_exit(&mi->mi_async_lock);
1211 VFS_HOLD(vfsp); /* hold for new thread */
1212 (void) zthread_create(NULL, 0, nfs_async_start,
1213 vfsp, 0, minclsyspri);
1214 mutex_enter(&mi->mi_async_lock);
1215 }
1216 cv_signal(&mi->mi_async_work_cv);
1217 ASSERT(mi->mi_async_req_count != 0);
1218 mi->mi_async_req_count--;
1219 }
1220 mutex_enter(&mi->mi_lock);
1221 }
1222 mutex_exit(&mi->mi_lock);
1223 /*
1224 * Let everyone know we're done.
1225 */
1226 mi->mi_manager_thread = NULL;
1227 cv_broadcast(&mi->mi_async_cv);
1228
1229 /*
1230 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
1231 * since CALLB_CPR_EXIT is actually responsible for releasing
1232 * 'mi_async_lock'.
1233 */
1234 CALLB_CPR_EXIT(&cprinfo);
1235 VFS_RELE(vfsp); /* release thread's hold */
1236 zthread_exit();
1237 }
1238
1239 /*
1240 * Signal (and wait for) the async manager thread to clean up and go away.
1241 */
1242 void
1243 nfs_async_manager_stop(vfs_t *vfsp)
1244 {
1245 mntinfo_t *mi = VFTOMI(vfsp);
1246
1247 mutex_enter(&mi->mi_async_lock);
1248 mutex_enter(&mi->mi_lock);
1249 mi->mi_flags |= MI_ASYNC_MGR_STOP;
1250 mutex_exit(&mi->mi_lock);
1251 cv_broadcast(&mi->mi_async_reqs_cv);
1252 while (mi->mi_manager_thread != NULL)
1253 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
1254 mutex_exit(&mi->mi_async_lock);
1255 }
1256
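/*
 * Queue an asynchronous readahead request. Returns 0 if the request was
 * queued, or -1 if the caller should simply skip the readahead (address
 * outside the segment, allocation failure, a pending lock operation on
 * the rnode, or async i/o disabled for the mount).
 */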
1257 int
1258 nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
1259 struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
1260 u_offset_t, caddr_t, struct seg *, cred_t *))
1261 {
1262 rnode_t *rp;
1263 mntinfo_t *mi;
1264 struct nfs_async_reqs *args;
1265
1266 rp = VTOR(vp);
1267 ASSERT(rp->r_freef == NULL);
1268
1269 mi = VTOMI(vp);
1270
1271 /*
1272 * If addr falls in a different segment, don't bother doing readahead.
1273 */
1274 if (addr >= seg->s_base + seg->s_size)
1275 return (-1);
1276
1277 /*
1278 * If we can't allocate a request structure, punt on the readahead.
1279 */
1280 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1281 return (-1);
1282
1283 /*
1284 * If a lock operation is pending, don't initiate any new
1285 * readaheads. Otherwise, bump r_count to indicate the new
1286 * asynchronous I/O.
1287 */
1288 if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
1289 kmem_free(args, sizeof (*args));
1290 return (-1);
1291 }
1292 mutex_enter(&rp->r_statelock);
1293 rp->r_count++;
1294 mutex_exit(&rp->r_statelock);
1295 nfs_rw_exit(&rp->r_lkserlock);
1296
1297 args->a_next = NULL;
1298 #ifdef DEBUG
1299 args->a_queuer = curthread;
1300 #endif
1301 VN_HOLD(vp);
1302 args->a_vp = vp;
1303 ASSERT(cr != NULL);
1304 crhold(cr);
1305 args->a_cred = cr;
1306 args->a_io = NFS_READ_AHEAD;
1307 args->a_nfs_readahead = readahead;
1308 args->a_nfs_blkoff = blkoff;
1309 args->a_nfs_seg = seg;
1310 args->a_nfs_addr = addr;
1311
1312 mutex_enter(&mi->mi_async_lock);
1313
1314 /*
1315 * If asyncio has been disabled, don't bother with the readahead.
1316 */
1317 if (mi->mi_max_threads == 0) {
1318 mutex_exit(&mi->mi_async_lock);
1319 goto noasync;
1320 }
1321
1322 /*
1323 * Link request structure into the async list and
1324 * wakeup async thread to do the i/o.
1325 */
1326 if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
1327 mi->mi_async_reqs[NFS_READ_AHEAD] = args;
1328 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1329 } else {
1330 mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
1331 mi->mi_async_tail[NFS_READ_AHEAD] = args;
1332 }
1333
1334 if (mi->mi_io_kstats) {
1335 mutex_enter(&mi->mi_lock);
1336 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1337 mutex_exit(&mi->mi_lock);
1338 }
1339
1340 mi->mi_async_req_count++;
1341 ASSERT(mi->mi_async_req_count != 0);
1342 cv_signal(&mi->mi_async_reqs_cv);
1343 mutex_exit(&mi->mi_async_lock);
1344 return (0);
1345
1346 noasync:
1347 mutex_enter(&rp->r_statelock);
1348 rp->r_count--;
1349 cv_broadcast(&rp->r_cv);
1350 mutex_exit(&rp->r_statelock);
1351 VN_RELE(vp);
1352 crfree(cr);
1353 kmem_free(args, sizeof (*args));
1354 return (-1);
1355 }
1356
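/*
 * Queue an asynchronous putpage request. If the request cannot be
 * queued (allocation failure or async i/o disabled), fall back to a
 * synchronous putpage in this thread's context, except for
 * pageout/fsflush and cross-zone callers, where the pages are simply
 * re-marked dirty via pvn_write_done(B_ERROR) instead.
 */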
1357 int
1358 nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
1359 int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
1360 u_offset_t, size_t, int, cred_t *))
1361 {
1362 rnode_t *rp;
1363 mntinfo_t *mi;
1364 struct nfs_async_reqs *args;
1365
1366 ASSERT(flags & B_ASYNC);
1367 ASSERT(vp->v_vfsp != NULL);
1368
1369 rp = VTOR(vp);
1370 ASSERT(rp->r_count > 0);
1371
1372 mi = VTOMI(vp);
1373
1374 /*
1375 * If we can't allocate a request structure, do the putpage
1376 * operation synchronously in this thread's context.
1377 */
1378 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1379 goto noasync;
1380
1381 args->a_next = NULL;
1382 #ifdef DEBUG
1383 args->a_queuer = curthread;
1384 #endif
1385 VN_HOLD(vp);
1386 args->a_vp = vp;
1387 ASSERT(cr != NULL);
1388 crhold(cr);
1389 args->a_cred = cr;
1390 args->a_io = NFS_PUTAPAGE;
1391 args->a_nfs_putapage = putapage;
1392 args->a_nfs_pp = pp;
1393 args->a_nfs_off = off;
1394 args->a_nfs_len = (uint_t)len;
1395 args->a_nfs_flags = flags;
1396
1397 mutex_enter(&mi->mi_async_lock);
1398
1399 /*
1400 * If asyncio has been disabled, then make a synchronous request.
1401 * This check is done a second time in case async io was disabled
1402 * while this thread was blocked waiting for memory pressure to
1403 * reduce or for the queue to drain.
1404 */
1405 if (mi->mi_max_threads == 0) {
1406 mutex_exit(&mi->mi_async_lock);
1407 goto noasync;
1408 }
1409
1410 /*
1411 * Link request structure into the async list and
1412 * wakeup async thread to do the i/o.
1413 */
1414 if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
1415 mi->mi_async_reqs[NFS_PUTAPAGE] = args;
1416 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1417 } else {
1418 mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
1419 mi->mi_async_tail[NFS_PUTAPAGE] = args;
1420 }
1421
1422 mutex_enter(&rp->r_statelock);
1423 rp->r_count++;
1424 rp->r_awcount++;
1425 mutex_exit(&rp->r_statelock);
1426
1427 if (mi->mi_io_kstats) {
1428 mutex_enter(&mi->mi_lock);
1429 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1430 mutex_exit(&mi->mi_lock);
1431 }
1432
1433 mi->mi_async_req_count++;
1434 ASSERT(mi->mi_async_req_count != 0);
1435 cv_signal(&mi->mi_async_reqs_cv);
1436 mutex_exit(&mi->mi_async_lock);
1437 return (0);
1438
1439 noasync:
1440 if (args != NULL) {
1441 VN_RELE(vp);
1442 crfree(cr);
1443 kmem_free(args, sizeof (*args));
1444 }
1445
1446 if (curproc == proc_pageout || curproc == proc_fsflush) {
1447 /*
1448 * If we get here in the context of the pageout/fsflush,
1449 * we refuse to do a sync write, because this may hang
1450 * pageout (and the machine). In this case, we just
1451 * re-mark the page as dirty and punt on the page.
1452 *
1453 * Make sure B_FORCE isn't set. We can re-mark the
1454 * pages as dirty and unlock the pages in one swoop by
1455 * passing in B_ERROR to pvn_write_done(). However,
1456 * we should make sure B_FORCE isn't set - we don't
1457 * want the page tossed before it gets written out.
1458 */
1459 if (flags & B_FORCE)
1460 flags &= ~(B_INVAL | B_FORCE);
1461 pvn_write_done(pp, flags | B_ERROR);
1462 return (0);
1463 }
1464 if (nfs_zone() != mi->mi_zone) {
1465 /*
1466 * So this was a cross-zone sync putpage. We pass in B_ERROR
1467 * to pvn_write_done() to re-mark the pages as dirty and unlock
1468 * them.
1469 *
1470 * We don't want to clear B_FORCE here as the caller presumably
1471 * knows what they're doing if they set it.
1472 */
1473 pvn_write_done(pp, flags | B_ERROR);
1474 return (EPERM);
1475 }
1476 return ((*putapage)(vp, pp, off, len, flags, cr));
1477 }
1478
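/*
 * Queue an asynchronous pageio request. The fallback behaviour when the
 * request cannot be queued mirrors nfs_async_putapage(), with the
 * addition that reads just have their page list cleaned up via
 * pvn_read_done().
 */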
1479 int
1480 nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
1481 int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
1482 size_t, int, cred_t *))
1483 {
1484 rnode_t *rp;
1485 mntinfo_t *mi;
1486 struct nfs_async_reqs *args;
1487
1488 ASSERT(flags & B_ASYNC);
1489 ASSERT(vp->v_vfsp != NULL);
1490
1491 rp = VTOR(vp);
1492 ASSERT(rp->r_count > 0);
1493
1494 mi = VTOMI(vp);
1495
1496 /*
1497 * If we can't allocate a request structure, do the pageio
1498 * request synchronously in this thread's context.
1499 */
1500 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1501 goto noasync;
1502
1503 args->a_next = NULL;
1504 #ifdef DEBUG
1505 args->a_queuer = curthread;
1506 #endif
1507 VN_HOLD(vp);
1508 args->a_vp = vp;
1509 ASSERT(cr != NULL);
1510 crhold(cr);
1511 args->a_cred = cr;
1512 args->a_io = NFS_PAGEIO;
1513 args->a_nfs_pageio = pageio;
1514 args->a_nfs_pp = pp;
1515 args->a_nfs_off = io_off;
1516 args->a_nfs_len = (uint_t)io_len;
1517 args->a_nfs_flags = flags;
1518
1519 mutex_enter(&mi->mi_async_lock);
1520
1521 /*
1522 * If asyncio has been disabled, then make a synchronous request.
1523 * This check is done a second time in case async io was disabled
1524 * while this thread was blocked waiting for memory pressure to
1525 * reduce or for the queue to drain.
1526 */
1527 if (mi->mi_max_threads == 0) {
1528 mutex_exit(&mi->mi_async_lock);
1529 goto noasync;
1530 }
1531
1532 /*
1533 * Link request structure into the async list and
1534 * wakeup async thread to do the i/o.
1535 */
1536 if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
1537 mi->mi_async_reqs[NFS_PAGEIO] = args;
1538 mi->mi_async_tail[NFS_PAGEIO] = args;
1539 } else {
1540 mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
1541 mi->mi_async_tail[NFS_PAGEIO] = args;
1542 }
1543
1544 mutex_enter(&rp->r_statelock);
1545 rp->r_count++;
1546 rp->r_awcount++;
1547 mutex_exit(&rp->r_statelock);
1548
1549 if (mi->mi_io_kstats) {
1550 mutex_enter(&mi->mi_lock);
1551 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1552 mutex_exit(&mi->mi_lock);
1553 }
1554
1555 mi->mi_async_req_count++;
1556 ASSERT(mi->mi_async_req_count != 0);
1557 cv_signal(&mi->mi_async_reqs_cv);
1558 mutex_exit(&mi->mi_async_lock);
1559 return (0);
1560
1561 noasync:
1562 if (args != NULL) {
1563 VN_RELE(vp);
1564 crfree(cr);
1565 kmem_free(args, sizeof (*args));
1566 }
1567
1568 /*
1569 * If we can't do it ASYNC, for reads we do nothing (but cleanup
1570 * the page list), for writes we do it synchronously, except for
1571 * proc_pageout/proc_fsflush as described below.
1572 */
1573 if (flags & B_READ) {
1574 pvn_read_done(pp, flags | B_ERROR);
1575 return (0);
1576 }
1577
1578 if (curproc == proc_pageout || curproc == proc_fsflush) {
1579 /*
1580 * If we get here in the context of the pageout/fsflush,
1581 * we refuse to do a sync write, because this may hang
1582 * pageout/fsflush (and the machine). In this case, we just
1583 * re-mark the page as dirty and punt on the page.
1584 *
1585 * Make sure B_FORCE isn't set. We can re-mark the
1586 * pages as dirty and unlock the pages in one swoop by
1587 * passing in B_ERROR to pvn_write_done(). However,
1588 * we should make sure B_FORCE isn't set - we don't
1589 * want the page tossed before it gets written out.
1590 */
1591 if (flags & B_FORCE)
1592 flags &= ~(B_INVAL | B_FORCE);
1593 pvn_write_done(pp, flags | B_ERROR);
1594 return (0);
1595 }
1596
1597 if (nfs_zone() != mi->mi_zone) {
1598 /*
1599 * So this was a cross-zone sync pageio. We pass in B_ERROR
1600 * to pvn_write_done() to re-mark the pages as dirty and unlock
1601 * them.
1602 *
1603 * We don't want to clear B_FORCE here as the caller presumably
1604 * knows what they're doing if they set it.
1605 */
1606 pvn_write_done(pp, flags | B_ERROR);
1607 return (EPERM);
1608 }
1609 return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
1610 }
1611
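/*
 * Queue an asynchronous readdir request. If the request cannot be
 * queued, reset the rddir_cache entry to RDDIRREQ so a later caller
 * re-issues the over-the-wire readdir, wake any thread blocked in
 * RDDIRWAIT, and drop the cache entry's hold.
 */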
1612 void
1613 nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
1614 int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
1615 {
1616 rnode_t *rp;
1617 mntinfo_t *mi;
1618 struct nfs_async_reqs *args;
1619
1620 rp = VTOR(vp);
1621 ASSERT(rp->r_freef == NULL);
1622
1623 mi = VTOMI(vp);
1624
1625 /*
1626 * If we can't allocate a request structure, do the readdir
1627 * operation synchronously in this thread's context.
1628 */
1629 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1630 goto noasync;
1631
1632 args->a_next = NULL;
1633 #ifdef DEBUG
1634 args->a_queuer = curthread;
1635 #endif
1636 VN_HOLD(vp);
1637 args->a_vp = vp;
1638 ASSERT(cr != NULL);
1639 crhold(cr);
1640 args->a_cred = cr;
1641 args->a_io = NFS_READDIR;
1642 args->a_nfs_readdir = readdir;
1643 args->a_nfs_rdc = rdc;
1644
1645 mutex_enter(&mi->mi_async_lock);
1646
1647 /*
1648 * If asyncio has been disabled, then make a synchronous request.
1649 */
1650 if (mi->mi_max_threads == 0) {
1651 mutex_exit(&mi->mi_async_lock);
1652 goto noasync;
1653 }
1654
1655 /*
1656 * Link request structure into the async list and
1657 * wakeup async thread to do the i/o.
1658 */
1659 if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
1660 mi->mi_async_reqs[NFS_READDIR] = args;
1661 mi->mi_async_tail[NFS_READDIR] = args;
1662 } else {
1663 mi->mi_async_tail[NFS_READDIR]->a_next = args;
1664 mi->mi_async_tail[NFS_READDIR] = args;
1665 }
1666
1667 mutex_enter(&rp->r_statelock);
1668 rp->r_count++;
1669 mutex_exit(&rp->r_statelock);
1670
1671 if (mi->mi_io_kstats) {
1672 mutex_enter(&mi->mi_lock);
1673 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1674 mutex_exit(&mi->mi_lock);
1675 }
1676
1677 mi->mi_async_req_count++;
1678 ASSERT(mi->mi_async_req_count != 0);
1679 cv_signal(&mi->mi_async_reqs_cv);
1680 mutex_exit(&mi->mi_async_lock);
1681 return;
1682
1683 noasync:
1684 if (args != NULL) {
1685 VN_RELE(vp);
1686 crfree(cr);
1687 kmem_free(args, sizeof (*args));
1688 }
1689
1690 rdc->entries = NULL;
1691 mutex_enter(&rp->r_statelock);
1692 ASSERT(rdc->flags & RDDIR);
1693 rdc->flags &= ~RDDIR;
1694 rdc->flags |= RDDIRREQ;
1695 /*
1696 * Check the flag to see if RDDIRWAIT is set. If RDDIRWAIT
1697 * is set, wakeup the thread sleeping in cv_wait_sig().
1698 * The woken up thread will reset the flag to RDDIR and will
1699 * continue with the readdir operation.
1700 */
1701 if (rdc->flags & RDDIRWAIT) {
1702 rdc->flags &= ~RDDIRWAIT;
1703 cv_broadcast(&rdc->cv);
1704 }
1705 mutex_exit(&rp->r_statelock);
1706 rddir_cache_rele(rdc);
1707 }
1708
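/*
 * Queue an asynchronous COMMIT request. If the request cannot be
 * queued, pageout/fsflush and cross-zone callers mark each page
 * C_COMMIT and unlock it; all other callers perform the commit
 * synchronously in this thread's context.
 */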
1709 void
1710 nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
1711 cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
1712 cred_t *))
1713 {
1714 rnode_t *rp;
1715 mntinfo_t *mi;
1716 struct nfs_async_reqs *args;
1717 page_t *pp;
1718
1719 rp = VTOR(vp);
1720 mi = VTOMI(vp);
1721
1722 /*
1723 * If we can't allocate a request structure, do the commit
1724 * operation synchronously in this thread's context.
1725 */
1726 if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
1727 goto noasync;
1728
1729 args->a_next = NULL;
1730 #ifdef DEBUG
1731 args->a_queuer = curthread;
1732 #endif
1733 VN_HOLD(vp);
1734 args->a_vp = vp;
1735 ASSERT(cr != NULL);
1736 crhold(cr);
1737 args->a_cred = cr;
1738 args->a_io = NFS_COMMIT;
1739 args->a_nfs_commit = commit;
1740 args->a_nfs_plist = plist;
1741 args->a_nfs_offset = offset;
1742 args->a_nfs_count = count;
1743
1744 mutex_enter(&mi->mi_async_lock);
1745
1746 /*
1747 * If asyncio has been disabled, then make a synchronous request.
1748 * This check is done a second time in case async io was disabled
1749 * while this thread was blocked waiting for memory pressure to
1750 * reduce or for the queue to drain.
1751 */
1752 if (mi->mi_max_threads == 0) {
1753 mutex_exit(&mi->mi_async_lock);
1754 goto noasync;
1755 }
1756
1757 /*
1758 * Link request structure into the async list and
1759 * wakeup async thread to do the i/o.
1760 */
1761 if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
1762 mi->mi_async_reqs[NFS_COMMIT] = args;
1763 mi->mi_async_tail[NFS_COMMIT] = args;
1764 } else {
1765 mi->mi_async_tail[NFS_COMMIT]->a_next = args;
1766 mi->mi_async_tail[NFS_COMMIT] = args;
1767 }
1768
1769 mutex_enter(&rp->r_statelock);
1770 rp->r_count++;
1771 mutex_exit(&rp->r_statelock);
1772
1773 if (mi->mi_io_kstats) {
1774 mutex_enter(&mi->mi_lock);
1775 kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
1776 mutex_exit(&mi->mi_lock);
1777 }
1778
1779 mi->mi_async_req_count++;
1780 ASSERT(mi->mi_async_req_count != 0);
1781 cv_signal(&mi->mi_async_reqs_cv);
1782 mutex_exit(&mi->mi_async_lock);
1783 return;
1784
1785 noasync:
1786 if (args != NULL) {
1787 VN_RELE(vp);
1788 crfree(cr);
1789 kmem_free(args, sizeof (*args));
1790 }
1791
1792 if (curproc == proc_pageout || curproc == proc_fsflush ||
1793 nfs_zone() != mi->mi_zone) {
1794 while (plist != NULL) {
1795 pp = plist;
1796 page_sub(&plist, pp);
1797 pp->p_fsdata = C_COMMIT;
1798 page_unlock(pp);
1799 }
1800 return;
1801 }
1802 (*commit)(vp, plist, offset, count, cr);
1803 }
1804
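/*
 * Queue an asynchronous inactive request so the final teardown of the
 * rnode happens in the zone that created the mount. If the manager
 * thread is already gone (the zone or mount is going away), clean up
 * any pending unlink ("silly rename") state here and release the rnode
 * via rp_addfree() instead.
 */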
1805 void
1806 nfs_async_inactive(vnode_t *vp, cred_t *cr,
1807 void (*inactive)(vnode_t *, cred_t *))
1808 {
1809 mntinfo_t *mi;
1810 struct nfs_async_reqs *args;
1811
1812 mi = VTOMI(vp);
1813
1814 args = kmem_alloc(sizeof (*args), KM_SLEEP);
1815 args->a_next = NULL;
1816 #ifdef DEBUG
1817 args->a_queuer = curthread;
1818 #endif
1819 args->a_vp = vp;
1820 ASSERT(cr != NULL);
1821 crhold(cr);
1822 args->a_cred = cr;
1823 args->a_io = NFS_INACTIVE;
1824 args->a_nfs_inactive = inactive;
1825
1826 /*
1827 * Note that we don't check mi->mi_max_threads here, since we
1828 * *need* to get rid of this vnode regardless of whether someone
1829 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
1830 *
1831 * The manager thread knows about this and is willing to create
1832 * at least one thread to accommodate us.
1833 */
1834 mutex_enter(&mi->mi_async_lock);
1835 if (mi->mi_manager_thread == NULL) {
1836 rnode_t *rp = VTOR(vp);
1837
1838 mutex_exit(&mi->mi_async_lock);
1839 crfree(cr); /* drop our reference */
1840 kmem_free(args, sizeof (*args));
1841 /*
1842 * We can't do an over-the-wire call since we're in the wrong
1843 * zone, so we need to clean up state as best we can and then
1844 * throw away the vnode.
1845 */
1846 mutex_enter(&rp->r_statelock);
1847 if (rp->r_unldvp != NULL) {
1848 vnode_t *unldvp;
1849 char *unlname;
1850 cred_t *unlcred;
1851
1852 unldvp = rp->r_unldvp;
1853 rp->r_unldvp = NULL;
1854 unlname = rp->r_unlname;
1855 rp->r_unlname = NULL;
1856 unlcred = rp->r_unlcred;
1857 rp->r_unlcred = NULL;
1858 mutex_exit(&rp->r_statelock);
1859
1860 VN_RELE(unldvp);
1861 kmem_free(unlname, MAXNAMELEN);
1862 crfree(unlcred);
1863 } else {
1864 mutex_exit(&rp->r_statelock);
1865 }
1866 /*
1867 * No need to explicitly throw away any cached pages. The
1868 * eventual rinactive() will attempt a synchronous
1869 * VOP_PUTPAGE() which will immediately fail since the request
1870 * is coming from the wrong zone, and then will proceed to call
1871 * nfs_invalidate_pages() which will clean things up for us.
1872 */
1873 rp_addfree(VTOR(vp), cr);
1874 return;
1875 }
1876
1877 if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
1878 mi->mi_async_reqs[NFS_INACTIVE] = args;
1879 } else {
1880 mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
1881 }
1882 mi->mi_async_tail[NFS_INACTIVE] = args;
1883 /*
1884 * Don't increment r_count, since we're trying to get rid of the vnode.
1885 */
1886
1887 mi->mi_async_req_count++;
1888 ASSERT(mi->mi_async_req_count != 0);
1889 cv_signal(&mi->mi_async_reqs_cv);
1890 mutex_exit(&mi->mi_async_lock);
1891 }
1892
1893 /*
1894 * The async queues for each mounted file system are arranged as a
1895 * set of queues, one for each async i/o type. Requests are taken
1896 * from the queues in a round-robin fashion. A number of consecutive
1897 * requests are taken from each queue before moving on to the next
1898 * queue. This functionality may allow the NFS Version 2 server to do
1899 * write clustering, even if the client is mixing writes and reads
1900 * because it will take multiple write requests from the queue
1901 * before processing any of the other async i/o types.
1902 *
1903 * XXX The nfs_async_start thread is unsafe in the light of the present
1904 * model defined by cpr to suspend the system. Specifically over the
1905 * wire calls are cpr-unsafe. The thread should be reevaluated in
1906 * case of future updates to the cpr model.
1907 */
1908 static void
1909 nfs_async_start(struct vfs *vfsp)
1910 {
1911 struct nfs_async_reqs *args;
1912 mntinfo_t *mi = VFTOMI(vfsp);
1913 clock_t time_left = 1;
1914 callb_cpr_t cprinfo;
1915 int i;
1916
1917 /*
1918 * Dynamic initialization of nfs_async_timeout to allow nfs to be
1919 * built in an implementation independent manner.
1920 */
1921 if (nfs_async_timeout == -1)
1922 nfs_async_timeout = NFS_ASYNC_TIMEOUT;
1923
1924 CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");
1925
1926 mutex_enter(&mi->mi_async_lock);
1927 for (;;) {
1928 /*
1929 * Find the next queue containing an entry. We start
1930 * at the current queue pointer and then round robin
1931 * through all of them until we either find a non-empty
1932 * queue or have looked through all of them.
1933 */
1934 for (i = 0; i < NFS_ASYNC_TYPES; i++) {
1935 args = *mi->mi_async_curr;
1936 if (args != NULL)
1937 break;
1938 mi->mi_async_curr++;
1939 if (mi->mi_async_curr ==
1940 &mi->mi_async_reqs[NFS_ASYNC_TYPES])
1941 mi->mi_async_curr = &mi->mi_async_reqs[0];
1942 }
1943 /*
1944 * If we didn't find an entry, then block until woken up
1945 * again and then look through the queues again.
1946 */
1947 if (args == NULL) {
1948 /*
1949 * Exiting is considered to be safe for CPR as well
1950 */
1951 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1952
1953 /*
1954 * Wakeup thread waiting to unmount the file
1955 * system only if all async threads are inactive.
1956 *
1957 * If we've timed-out and there's nothing to do,
1958 * then get rid of this thread.
1959 */
1960 if (mi->mi_max_threads == 0 || time_left <= 0) {
1961 if (--mi->mi_threads == 0)
1962 cv_signal(&mi->mi_async_cv);
1963 CALLB_CPR_EXIT(&cprinfo);
1964 VFS_RELE(vfsp); /* release thread's hold */
1965 zthread_exit();
1966 /* NOTREACHED */
1967 }
1968 time_left = cv_timedwait(&mi->mi_async_work_cv,
1969 &mi->mi_async_lock, nfs_async_timeout + lbolt);
1970
1971 CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
1972
1973 continue;
1974 }
1975 time_left = 1;
1976
1977 /*
1978 * Remove the request from the async queue and then
1979 * update the current async request queue pointer. If
1980 * the current queue is empty or we have removed enough
1981 * consecutive entries from it, then reset the counter
1982 * for this queue and then move the current pointer to
1983 * the next queue.
1984 */
1985 *mi->mi_async_curr = args->a_next;
1986 if (*mi->mi_async_curr == NULL ||
1987 --mi->mi_async_clusters[args->a_io] == 0) {
1988 mi->mi_async_clusters[args->a_io] =
1989 mi->mi_async_init_clusters;
1990 mi->mi_async_curr++;
1991 if (mi->mi_async_curr ==
1992 &mi->mi_async_reqs[NFS_ASYNC_TYPES])
1993 mi->mi_async_curr = &mi->mi_async_reqs[0];
1994 }
1995
1996 if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
1997 mutex_enter(&mi->mi_lock);
1998 kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
1999 mutex_exit(&mi->mi_lock);
2000 }
2001
2002 mutex_exit(&mi->mi_async_lock);
2003
2004 /*
2005 * Obtain arguments from the async request structure.
2006 */
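/*
 * Dispatch the request.  Read-ahead is purely an optimization, so it
 * is quietly dropped when the worker threads are being torn down
 * (mi_max_threads == 0); the other request types are still processed.
 */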
2007 if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2008 (*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2009 args->a_nfs_addr, args->a_nfs_seg,
2010 args->a_cred);
2011 } else if (args->a_io == NFS_PUTAPAGE) {
2012 (void) (*args->a_nfs_putapage)(args->a_vp,
2013 args->a_nfs_pp, args->a_nfs_off,
2014 args->a_nfs_len, args->a_nfs_flags,
2015 args->a_cred);
2016 } else if (args->a_io == NFS_PAGEIO) {
2017 (void) (*args->a_nfs_pageio)(args->a_vp,
2018 args->a_nfs_pp, args->a_nfs_off,
2019 args->a_nfs_len, args->a_nfs_flags,
2020 args->a_cred);
2021 } else if (args->a_io == NFS_READDIR) {
2022 (void) ((*args->a_nfs_readdir)(args->a_vp,
2023 args->a_nfs_rdc, args->a_cred));
2024 } else if (args->a_io == NFS_COMMIT) {
2025 (*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2026 args->a_nfs_offset, args->a_nfs_count,
2027 args->a_cred);
2028 } else if (args->a_io == NFS_INACTIVE) {
2029 (*args->a_nfs_inactive)(args->a_vp, args->a_cred);
2030 }
2031
2032 /*
2033 * Now, release the vnode and free the credentials
2034 * structure.
2035 */
2036 free_async_args(args);
2037 /*
2038 * Reacquire the mutex because it is needed at the top of the loop.
2039 */
2040 mutex_enter(&mi->mi_async_lock);
2041 }
2042 }
2043
2044 void
2045 nfs_async_stop(struct vfs *vfsp)
2046 {
2047 mntinfo_t *mi = VFTOMI(vfsp);
2048
2049 /*
2050 * Wait for all outstanding async operations to complete and for the
2051 * worker threads to exit.
2052 */
2053 mutex_enter(&mi->mi_async_lock);
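/*
 * Setting mi_max_threads to zero makes each nfs_async_start() worker
 * exit the next time it finds its queues empty.
 */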
2054 mi->mi_max_threads = 0;
2055 cv_broadcast(&mi->mi_async_work_cv);
2056 while (mi->mi_threads != 0)
2057 cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2058 mutex_exit(&mi->mi_async_lock);
2059 }
2060
2061 /*
2062 * nfs_async_stop_sig:
2063 * Wait for all outstanding putpage operations to complete. If a signal
2064 * is delivered we will abort and return non-zero. If we can put all the
2065 * pages we will return 0. This routine is called from nfs_unmount and
2066 * nfs3_unmount to make these operations interruptible.
2067 */
2068 int
2069 nfs_async_stop_sig(struct vfs *vfsp)
2070 {
2071 mntinfo_t *mi = VFTOMI(vfsp);
2072 ushort_t omax;
2073 int rval;
2074
2075 /*
2076 * Wait for all outstanding async operations to complete and for the
2077 * worker threads to exit.
2078 */
2079 mutex_enter(&mi->mi_async_lock);
2080 omax = mi->mi_max_threads;
2081 mi->mi_max_threads = 0;
2082 /*
2083 * Tell all the worker threads to exit.
2084 */
2085 cv_broadcast(&mi->mi_async_work_cv);
2086 while (mi->mi_threads != 0) {
2087 if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2088 break;
2089 }
2090 rval = (mi->mi_threads != 0); /* Interrupted */
2091 if (rval)
2092 mi->mi_max_threads = omax;
2093 mutex_exit(&mi->mi_async_lock);
2094
2095 return (rval);
2096 }
2097
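/*
 * Copy tcount bytes from uio into the file's pages through the kernel
 * mapping at base (a segmap/segkpm address), in chunks of at most
 * PAGESIZE bytes.  Pages are created without first being read in when
 * the write will overwrite the entire page or will write to or beyond
 * the current end of file (see the comments in the loop below).
 * r_size is updated after each uiomove() and the rnode is marked
 * RDIRTY.  A nonzero pgcreated means the caller has already created
 * and mapped a page at base.  The caller must hold r_rwlock as a
 * writer.
 */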
2098 int
2099 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2100 {
2101 int pagecreate;
2102 int n;
2103 int saved_n;
2104 caddr_t saved_base;
2105 u_offset_t offset;
2106 int error;
2107 int sm_error;
2108
2109 ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2110 ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2111 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2112
2113 /*
2114 * Move bytes in at most PAGESIZE chunks. We must avoid
2115 * spanning pages in uiomove() because page faults may cause
2116 * the cache to be invalidated out from under us. The r_size is not
2117 * updated until after the uiomove. If we push the last page of a
2118 * file before r_size is correct, we will lose the data written past
2119 * the current (and invalid) r_size.
2120 */
2121 do {
2122 offset = uio->uio_loffset;
2123 pagecreate = 0;
2124
2125 /*
2126 * n is the number of bytes required to satisfy the request
2127 * or the number of bytes to fill out the page.
2128 */
2129 n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)),
2130 tcount);
2131
2132 /*
2133 * Check to see if we can skip reading in the page
2134 * and just allocate the memory. We can do this
2135 * if we are going to rewrite the entire mapping
2136 * or if we are going to write to or beyond the current
2137 * end of file from the beginning of the mapping.
2138 *
2139 * The read of r_size is now protected by r_statelock.
2140 */
2141 mutex_enter(&rp->r_statelock);
2142 /*
2143 * When pgcreated is nonzero the caller has already done
2144 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2145 * segkpm this means we already have at least one page
2146 * created and mapped at base.
2147 */
2148 pagecreate = pgcreated ||
2149 (((uintptr_t)base & PAGEOFFSET) == 0 &&
2150 (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2151
2152 mutex_exit(&rp->r_statelock);
2153 if (pagecreate) {
2154 /*
2155 * The last argument tells segmap_pagecreate() to
2156 * always lock the page, as opposed to sometimes
2157 * returning with the page locked. This way we avoid a
2158 * fault on the ensuing uiomove(), but also
2159 * more importantly (to fix bug 1094402) we can
2160 * call segmap_fault() to unlock the page in all
2161 * cases. An alternative would be to modify
2162 * segmap_pagecreate() to tell us when it is
2163 * locking a page, but that's a fairly major
2164 * interface change.
2165 */
2166 if (pgcreated == 0)
2167 (void) segmap_pagecreate(segkmap, base,
2168 (uint_t)n, 1);
2169 saved_base = base;
2170 saved_n = n;
2171 }
2172
2173 /*
2174 * The number of bytes of data in the last page cannot be
2175 * determined accurately while the page is being uiomove'd to
2176 * and the size of the file is being updated at the same
2177 * time.  Thus, inform threads which need to know accurately
2178 * how much data is in the last page of the file.  They will
2179 * not do the i/o immediately, but will arrange for the i/o
2180 * to happen later, once this modify operation (the uiomove
2181 * and the r_size update below) has finished.
2182 */
2183 ASSERT(!(rp->r_flags & RMODINPROGRESS));
2184 mutex_enter(&rp->r_statelock);
2185 rp->r_flags |= RMODINPROGRESS;
2186 rp->r_modaddr = (offset & MAXBMASK);
2187 mutex_exit(&rp->r_statelock);
2188
2189 error = uiomove(base, n, UIO_WRITE, uio);
2190
2191 /*
2192 * r_size is the maximum number of
2193 * bytes known to be in the file.
2194 * Make sure it is at least as high as the
2195 * first unwritten byte pointed to by uio_loffset.
2196 */
2197 mutex_enter(&rp->r_statelock);
2198 if (rp->r_size < uio->uio_loffset)
2199 rp->r_size = uio->uio_loffset;
2200 rp->r_flags &= ~RMODINPROGRESS;
2201 rp->r_flags |= RDIRTY;
2202 mutex_exit(&rp->r_statelock);
2203
2204 /* n = # of bytes written */
2205 n = (int)(uio->uio_loffset - offset);
2206 base += n;
2207 tcount -= n;
2208 /*
2209 * If we created pages w/o initializing them completely,
2210 * we need to zero the part that wasn't set up.
2211 * This happens on most EOF write cases and if
2212 * we had some sort of error during the uiomove.
2213 */
2214 if (pagecreate) {
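/*
 * Note that base has already been advanced by n (the number
 * of bytes uiomove() actually copied), so the kzero() below
 * starts just past the copied data and clears the
 * uninitialized remainder of the page.
 */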
2215 if ((uio->uio_loffset & PAGEOFFSET) || n == 0)
2216 (void) kzero(base, PAGESIZE - n);
2217
2218 if (pgcreated) {
2219 /*
2220 * Caller is responsible for this page,
2221 * it was not created in this loop.
2222 */
2223 pgcreated = 0;
2224 } else {
2225 /*
2226 * For bug 1094402: segmap_pagecreate locks
2227 * page. Unlock it. This also unlocks the
2228 * pages allocated by page_create_va() in
2229 * segmap_pagecreate().
2230 */
2231 sm_error = segmap_fault(kas.a_hat, segkmap,
2232 saved_base, saved_n,
2233 F_SOFTUNLOCK, S_WRITE);
2234 if (error == 0)
2235 error = sm_error;
2236 }
2237 }
2238 } while (tcount > 0 && error == 0);
2239
2240 return (error);
2241 }
2242
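/*
 * Flush or invalidate the dirty pages of vp in the range
 * [off, off + len), or the entire file when len is zero, using the
 * rnode's r_putapage routine.  flags are the B_* pageout flags
 * (B_ASYNC, B_INVAL, B_FREE, ...) from the caller.
 */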
2243 int
2244 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr)
2245 {
2246 rnode_t *rp;
2247 page_t *pp;
2248 u_offset_t eoff;
2249 u_offset_t io_off;
2250 size_t io_len;
2251 int error;
2252 int rdirty;
2253 int err;
2254
2255 rp = VTOR(vp);
2256 ASSERT(rp->r_count > 0);
2257
2258 if (!vn_has_cached_data(vp))
2259 return (0);
2260
2261 ASSERT(vp->v_type != VCHR);
2262
2263 /*
2264 * If ROUTOFSPACE is set, then all writes turn into B_INVAL
2265 * writes. B_FORCE is set to force the VM system to actually
2266 * invalidate the pages, even if the i/o failed. The pages
2267 * need to get invalidated because they can't be written out
2268 * because there isn't any space left on either the server's
2269 * file system or in the user's disk quota. The B_FREE bit
2270 * is cleared to avoid confusion as to whether this is a
2271 * request to place the page on the freelist or to destroy
2272 * it.
2273 */
2274 if ((rp->r_flags & ROUTOFSPACE) ||
2275 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED))
2276 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE;
2277
2278 if (len == 0) {
2279 /*
2280 * If doing a full file synchronous operation, then clear
2281 * the RDIRTY bit. If a page gets dirtied while the flush
2282 * is happening, then RDIRTY will get set again. The
2283 * RDIRTY bit must get cleared before the flush so that
2284 * we don't lose this information.
2285 */
2286 if (off == (u_offset_t)0 &&
2287 !(flags & B_ASYNC) &&
2288 (rp->r_flags & RDIRTY)) {
2289 mutex_enter(&rp->r_statelock);
2290 rdirty = (rp->r_flags & RDIRTY);
2291 rp->r_flags &= ~RDIRTY;
2292 mutex_exit(&rp->r_statelock);
2293 } else
2294 rdirty = 0;
2295
2296 /*
2297 * Search the entire vp list for pages >= off, and flush
2298 * the dirty pages.
2299 */
2300 error = pvn_vplist_dirty(vp, off, rp->r_putapage,
2301 flags, cr);
2302
2303 /*
2304 * If an error occurred and the file was marked as dirty
2305 * before and we aren't forcibly invalidating pages, then
2306 * reset the RDIRTY flag.
2307 */
2308 if (error && rdirty &&
2309 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) {
2310 mutex_enter(&rp->r_statelock);
2311 rp->r_flags |= RDIRTY;
2312 mutex_exit(&rp->r_statelock);
2313 }
2314 } else {
2315 /*
2316 * Do a range from [off...off + len) looking for pages
2317 * to deal with.
2318 */
2319 error = 0;
2320 #ifdef lint
2321 io_len = 0;
2322 #endif
2323 eoff = off + len;
2324 mutex_enter(&rp->r_statelock);
2325 for (io_off = off; io_off < eoff && io_off < rp->r_size;
2326 io_off += io_len) {
2327 mutex_exit(&rp->r_statelock);
2328 /*
2329 * If we are not invalidating, synchronously
2330 * freeing, or writing pages, use the routine
2331 * page_lookup_nowait() to prevent reclaiming
2332 * them from the free list.
2333 */
2334 if ((flags & B_INVAL) || !(flags & B_ASYNC)) {
2335 pp = page_lookup(vp, io_off,
2336 (flags & (B_INVAL | B_FREE)) ?
2337 SE_EXCL : SE_SHARED);
2338 } else {
2339 pp = page_lookup_nowait(vp, io_off,
2340 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2341 }
2342
2343 if (pp == NULL || !pvn_getdirty(pp, flags))
2344 io_len = PAGESIZE;
2345 else {
2346 err = (*rp->r_putapage)(vp, pp, &io_off,
2347 &io_len, flags, cr);
2348 if (!error)
2349 error = err;
2350 /*
2351 * "io_off" and "io_len" are returned as
2352 * the range of pages we actually wrote.
2353 * This allows us to skip ahead more quickly
2354 * since several pages may've been dealt
2355 * with by this iteration of the loop.
2356 */
2357 }
2358 mutex_enter(&rp->r_statelock);
2359 }
2360 mutex_exit(&rp->r_statelock);
2361 }
2362
2363 return (error);
2364 }
2365
2366 void
2367 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr)
2368 {
2369 rnode_t *rp;
2370
2371 rp = VTOR(vp);
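/*
 * RTRUNCATE serializes truncation-driven invalidations: only one
 * thread at a time runs pvn_vplist_dirty() with B_INVAL | B_TRUNC
 * on this rnode; any others wait on r_cv.
 */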
2372 mutex_enter(&rp->r_statelock);
2373 while (rp->r_flags & RTRUNCATE)
2374 cv_wait(&rp->r_cv, &rp->r_statelock);
2375 rp->r_flags |= RTRUNCATE;
2376 if (off == (u_offset_t)0) {
2377 rp->r_flags &= ~RDIRTY;
2378 if (!(rp->r_flags & RSTALE))
2379 rp->r_error = 0;
2380 }
2381 rp->r_truncaddr = off;
2382 mutex_exit(&rp->r_statelock);
2383 (void) pvn_vplist_dirty(vp, off, rp->r_putapage,
2384 B_INVAL | B_TRUNC, cr);
2385 mutex_enter(&rp->r_statelock);
2386 rp->r_flags &= ~RTRUNCATE;
2387 cv_broadcast(&rp->r_cv);
2388 mutex_exit(&rp->r_statelock);
2389 }
2390
2391 static int nfs_write_error_to_cons_only = 0;
2392 #define MSG(x) (nfs_write_error_to_cons_only ? (x) : (x) + 1)
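/*
 * The message strings below start with '^', which cmn_err() treats as
 * "console only".  When nfs_write_error_to_cons_only is clear, MSG()
 * skips that first character so the message is not restricted to the
 * console and also reaches the system log.
 */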
2393
2394 /*
2395 * Print a file handle
2396 */
2397 void
2398 nfs_printfhandle(nfs_fhandle *fhp)
2399 {
2400 int *ip;
2401 char *buf;
2402 size_t bufsize;
2403 char *cp;
2404
2405 /*
2406 * 13 == "(file handle:"
2407 * maximum of NFS_FHANDLE_LEN / sizeof (*ip) elements in fh_buf times
2408 * 1 == ' '
2409 * 8 == maximum strlen of "%x"
2410 * 3 == ")\n\0"
2411 */
2412 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2413 buf = kmem_alloc(bufsize, KM_NOSLEEP);
2414 if (buf == NULL)
2415 return;
2416
2417 cp = buf;
2418 (void) strcpy(cp, "(file handle:");
2419 while (*cp != '\0')
2420 cp++;
2421 for (ip = (int *)fhp->fh_buf;
2422 ip < (int *)&fhp->fh_buf[fhp->fh_len];
2423 ip++) {
2424 (void) sprintf(cp, " %x", *ip);
2425 while (*cp != '\0')
2426 cp++;
2427 }
2428 (void) strcpy(cp, ")\n");
2429
2430 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf);
2431
2432 kmem_free(buf, bufsize);
2433 }
2434
2435 /*
2436 * Notify the system administrator that an NFS write error has
2437 * occurred.
2438 */
2439
2440 /* seconds between ENOSPC/EDQUOT messages */
2441 clock_t nfs_write_error_interval = 5;
2442
2443 void
2444 nfs_write_error(vnode_t *vp, int error, cred_t *cr)
2445 {
2446 mntinfo_t *mi;
2447
2448 mi = VTOMI(vp);
2449 /*
2450 * In case of forced unmount or zone shutdown, do not print any
2451 * messages since it can flood the console with error messages.
2452 */
2453 if (FS_OR_ZONE_GONE(mi->mi_vfsp))
2454 return;
2455
2456 /*
2457 * No use in flooding the console with ENOSPC
2458 * messages from the same file system.
2459 */
2460 if ((error != ENOSPC && error != EDQUOT) ||
2461 lbolt - mi->mi_printftime > 0) {
2462 zoneid_t zoneid = mi->mi_zone->zone_id;
2463
2464 #ifdef DEBUG
2465 nfs_perror(error, "NFS%ld write error on host %s: %m.\n",
2466 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL);
2467 #else
2468 nfs_perror(error, "NFS write error on host %s: %m.\n",
2469 VTOR(vp)->r_server->sv_hostname, NULL);
2470 #endif
2471 if (error == ENOSPC || error == EDQUOT) {
2472 zcmn_err(zoneid, CE_CONT,
2473 MSG("^File: userid=%d, groupid=%d\n"),
2474 crgetuid(cr), crgetgid(cr));
2475 if (crgetuid(CRED()) != crgetuid(cr) ||
2476 crgetgid(CRED()) != crgetgid(cr)) {
2477 zcmn_err(zoneid, CE_CONT,
2478 MSG("^User: userid=%d, groupid=%d\n"),
2479 crgetuid(CRED()), crgetgid(CRED()));
2480 }
2481 mi->mi_printftime = lbolt +
2482 nfs_write_error_interval * hz;
2483 }
2484 nfs_printfhandle(&VTOR(vp)->r_fh);
2485 #ifdef DEBUG
2486 if (error == EACCES) {
2487 zcmn_err(zoneid, CE_CONT,
2488 MSG("^nfs_bio: cred is%s kcred\n"),
2489 cr == kcred ? "" : " not");
2490 }
2491 #endif
2492 }
2493 }
2494
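/*
 * zone_key_create() "create" callback (see nfs_clntinit() below):
 * allocate and initialize the per-zone list of NFS v2/v3 mounts.
 */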
2495 /* ARGSUSED */
2496 static void *
2497 nfs_mi_init(zoneid_t zoneid)
2498 {
2499 struct mi_globals *mig;
2500
2501 mig = kmem_alloc(sizeof (*mig), KM_SLEEP);
2502 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL);
2503 list_create(&mig->mig_list, sizeof (mntinfo_t),
2504 offsetof(mntinfo_t, mi_zone_node));
2505 mig->mig_destructor_called = B_FALSE;
2506 return (mig);
2507 }
2508
2509 /*
2510 * Callback routine to tell all NFS mounts in the zone to stop creating new
2511 * threads. Existing threads should exit.
2512 */
2513 /* ARGSUSED */
2514 static void
2515 nfs_mi_shutdown(zoneid_t zoneid, void *data)
2516 {
2517 struct mi_globals *mig = data;
2518 mntinfo_t *mi;
2519
2520 ASSERT(mig != NULL);
2521 again:
2522 mutex_enter(&mig->mig_lock);
2523 for (mi = list_head(&mig->mig_list); mi != NULL;
2524 mi = list_next(&mig->mig_list, mi)) {
2525
2526 /*
2527 * If we've done the shutdown work for this FS, skip.
2528 * Once we go off the end of the list, we're done.
2529 */
2530 if (mi->mi_flags & MI_DEAD)
2531 continue;
2532
2533 /*
2534 * We will do work, so not done. Get a hold on the FS.
2535 */
2536 VFS_HOLD(mi->mi_vfsp);
2537
2538 /*
2539 * purge the DNLC for this filesystem
2540 */
2541 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0);
2542
2543 mutex_enter(&mi->mi_async_lock);
2544 /*
2545 * Tell existing async worker threads to exit.
2546 */
2547 mi->mi_max_threads = 0;
2548 cv_broadcast(&mi->mi_async_work_cv);
2549 /*
2550 * Set MI_ASYNC_MGR_STOP so the async manager thread starts
2551 * getting ready to exit when it's done with its current work.
2552 * Also set MI_DEAD to note we've acted on this FS.
2553 */
2554 mutex_enter(&mi->mi_lock);
2555 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD);
2556 mutex_exit(&mi->mi_lock);
2557 /*
2558 * Wake up the async manager thread.
2559 */
2560 cv_broadcast(&mi->mi_async_reqs_cv);
2561 mutex_exit(&mi->mi_async_lock);
2562
2563 /*
2564 * Drop lock and release FS, which may change list, then repeat.
2565 * We're done when every mi has been done or the list is empty.
2566 */
2567 mutex_exit(&mig->mig_lock);
2568 VFS_RELE(mi->mi_vfsp);
2569 goto again;
2570 }
2571 mutex_exit(&mig->mig_lock);
2572 }
2573
2574 static void
2575 nfs_mi_free_globals(struct mi_globals *mig)
2576 {
2577 list_destroy(&mig->mig_list); /* makes sure the list is empty */
2578 mutex_destroy(&mig->mig_lock);
2579 kmem_free(mig, sizeof (*mig));
2580
2581 }
2582
2583 /* ARGSUSED */
2584 static void
2585 nfs_mi_destroy(zoneid_t zoneid, void *data)
2586 {
2587 struct mi_globals *mig = data;
2588
2589 ASSERT(mig != NULL);
2590 mutex_enter(&mig->mig_lock);
2591 if (list_head(&mig->mig_list) != NULL) {
2592 /* Still waiting for VFS_FREEVFS() */
2593 mig->mig_destructor_called = B_TRUE;
2594 mutex_exit(&mig->mig_lock);
2595 return;
2596 }
2597 nfs_mi_free_globals(mig);
2598 }
2599
2600 /*
2601 * Add an NFS mount to the per-zone list of NFS mounts.
2602 */
2603 void
2604 nfs_mi_zonelist_add(mntinfo_t *mi)
2605 {
2606 struct mi_globals *mig;
2607
2608 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2609 mutex_enter(&mig->mig_lock);
2610 list_insert_head(&mig->mig_list, mi);
2611 mutex_exit(&mig->mig_lock);
2612 }
2613
2614 /*
2615 * Remove an NFS mount from the per-zone list of NFS mounts.
2616 */
2617 static void
2618 nfs_mi_zonelist_remove(mntinfo_t *mi)
2619 {
2620 struct mi_globals *mig;
2621
2622 mig = zone_getspecific(mi_list_key, mi->mi_zone);
2623 mutex_enter(&mig->mig_lock);
2624 list_remove(&mig->mig_list, mi);
2625 /*
2626 * We can be called asynchronously by VFS_FREEVFS() after the zone
2627 * shutdown/destroy callbacks have executed; if so, clean up the zone's
2628 * mi globals.
2629 */
2630 if (list_head(&mig->mig_list) == NULL &&
2631 mig->mig_destructor_called == B_TRUE) {
2632 nfs_mi_free_globals(mig);
2633 return;
2634 }
2635 mutex_exit(&mig->mig_lock);
2636 }
2637
2638 /*
2639 * NFS Client initialization routine. This routine should only be called
2640 * once. It performs the following tasks:
2641 * - Initialize all global locks
2642 * - Call sub-initialization routines (localize access to variables)
2643 */
2644 int
2645 nfs_clntinit(void)
2646 {
2647 #ifdef DEBUG
2648 static boolean_t nfs_clntup = B_FALSE;
2649 #endif
2650 int error;
2651
2652 #ifdef DEBUG
2653 ASSERT(nfs_clntup == B_FALSE);
2654 #endif
2655
2656 error = nfs_subrinit();
2657 if (error)
2658 return (error);
2659
2660 error = nfs_vfsinit();
2661 if (error) {
2662 /*
2663 * Cleanup nfs_subrinit() work
2664 */
2665 nfs_subrfini();
2666 return (error);
2667 }
2668 zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2669 nfs_mi_destroy);
2670
2671 nfs4_clnt_init();
2672
2673 #ifdef DEBUG
2674 nfs_clntup = B_TRUE;
2675 #endif
2676
2677 return (0);
2678 }
2679
2680 /*
2681 * This routine is only called if the NFS Client has been initialized but
2682 * the module failed to be installed. This routine will clean up the previously
2683 * allocated/initialized work.
2684 */
2685 void
2686 nfs_clntfini(void)
2687 {
2688 (void) zone_key_delete(mi_list_key);
2689 nfs_subrfini();
2690 nfs_vfsfini();
2691 nfs4_clnt_fini();
2692 }
2693
2694 /*
2695 * nfs_lockrelease:
2696 *
2697 * Release any locks on the given vnode that are held by the current
2698 * process.
2699 */
2700 void
2701 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2702 {
2703 flock64_t ld;
2704 struct shrlock shr;
2705 char *buf;
2706 int remote_lock_possible;
2707 int ret;
2708
2709 ASSERT((uintptr_t)vp > KERNELBASE);
2710
2711 /*
2712 * Generate an explicit unlock operation for the entire file. As a
2713 * partial optimization, only generate the unlock if there is a
2714 * lock registered for the file. We could check whether this
2715 * particular process has any locks on the file, but that would
2716 * require the local locking code to provide yet another query
2717 * routine. Note that no explicit synchronization is needed here.
2718 * At worst, flk_has_remote_locks() will return a false positive,
2719 * in which case the unlock call wastes time but doesn't harm
2720 * correctness.
2721 *
2722 * In addition, an unlock request is generated if the process
2723 * is listed as possibly having a lock on the file because the
2724 * server and client lock managers may have gotten out of sync.
2725 * N.B. It is important to make sure nfs_remove_locking_id() is
2726 * called here even if flk_has_remote_locks(vp) reports true.
2727 * If it is not called and there is an entry on the process id
2728 * list, that entry will never get removed.
2729 */
2730 remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2731 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2732 if (remote_lock_possible || flk_has_remote_locks(vp)) {
2733 ld.l_type = F_UNLCK; /* set to unlock entire file */
2734 ld.l_whence = 0; /* unlock from start of file */
2735 ld.l_start = 0;
2736 ld.l_len = 0; /* do entire file */
2737 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr);
2738
2739 if (ret != 0) {
2740 /*
2741 * If VOP_FRLOCK fails, make sure we unregister
2742 * local locks before we continue.
2743 */
2744 ld.l_pid = ttoproc(curthread)->p_pid;
2745 lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2746 #ifdef DEBUG
2747 nfs_perror(ret,
2748 "NFS lock release error on vp %p: %m.\n",
2749 (void *)vp, NULL);
2750 #endif
2751 }
2752
2753 /*
2754 * The call to VOP_FRLOCK may put the pid back on the
2755 * list. We need to remove it.
2756 */
2757 (void) nfs_remove_locking_id(vp, RLMPL_PID,
2758 (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2759 }
2760
2761 /*
2762 * As long as the vp has a share matching our pid,
2763 * pluck it off and unshare it. There are circumstances in
2764 * which the call to nfs_remove_locking_id() may put the
2765 * owner back on the list, in which case we simply do a
2766 * redundant and harmless unshare.
2767 */
2768 buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2769 while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2770 (char *)NULL, buf, &shr.s_own_len)) {
2771 shr.s_owner = buf;
2772 shr.s_access = 0;
2773 shr.s_deny = 0;
2774 shr.s_sysid = 0;
2775 shr.s_pid = curproc->p_pid;
2776
2777 ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr);
2778 #ifdef DEBUG
2779 if (ret != 0) {
2780 nfs_perror(ret,
2781 "NFS share release error on vp %p: %m.\n",
2782 (void *)vp, NULL);
2783 }
2784 #endif
2785 }
2786 kmem_free(buf, MAX_SHR_OWNER_LEN);
2787 }
2788
2789 /*
2790 * nfs_lockcompletion:
2791 *
2792 * If the vnode has a lock that makes it unsafe to cache the file, mark it
2793 * as non-cacheable (set the VNOCACHE bit).
2794 */
2795
2796 void
2797 nfs_lockcompletion(vnode_t *vp, int cmd)
2798 {
2799 #ifdef DEBUG
2800 rnode_t *rp = VTOR(vp);
2801
2802 ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2803 #endif
2804
2805 if (cmd == F_SETLK || cmd == F_SETLKW) {
2806 if (!lm_safemap(vp)) {
2807 mutex_enter(&vp->v_lock);
2808 vp->v_flag |= VNOCACHE;
2809 mutex_exit(&vp->v_lock);
2810 } else {
2811 mutex_enter(&vp->v_lock);
2812 vp->v_flag &= ~VNOCACHE;
2813 mutex_exit(&vp->v_lock);
2814 }
2815 }
2816 /*
2817 * The cached attributes of the file are stale after acquiring
2818 * the lock on the file. They were updated when the file was
2819 * opened, but not updated when the lock was acquired. Therefore the
2820 * cached attributes are invalidated after the lock is obtained.
2821 */
2822 PURGE_ATTRCACHE(vp);
2823 }
2824
2825 /*
2826 * The lock manager holds state making it possible for the client
2827 * and server to be out of sync. For example, if the response from
2828 * the server granting a lock request is lost, the server will think
2829 * the lock is granted and the client will think the lock is lost.
2830 * The client can tell when it is not certain whether it is in sync with
2831 * the server.
2832 *
2833 * To deal with this, a list of processes for which the client is
2834 * not sure if the server holds a lock is attached to the rnode.
2835 * When such a process closes the rnode, an unlock request is sent
2836 * to the server to unlock the entire file.
2837 *
2838 * The list is kept as a singly linked, NULL-terminated list.
2839 * Because it is only added to under extreme error conditions, the
2840 * list shouldn't get very big. DEBUG kernels print a message if
2841 * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily
2842 * chosen to be 8, but can be tuned at runtime.
2843 */
2844 #ifdef DEBUG
2845 /* int nfs_lmpl_high_water = 8; */
2846 int nfs_lmpl_high_water = 128;
2847 int nfs_cnt_add_locking_id = 0;
2848 int nfs_len_add_locking_id = 0;
2849 #endif /* DEBUG */
2850
2851 /*
2852 * Record that the nfs lock manager server may be holding a lock on
2853 * a vnode for a process.
2854 *
2855 * Because the nfs lock manager server holds state, it is possible
2856 * for the server to get out of sync with the client. This routine is called
2857 * from the client when it is no longer sure if the server is in sync
2858 * with the client. nfs_lockrelease() will then notice this and send
2859 * an unlock request when the file is closed.
2860 */
2861 void
2862 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len)
2863 {
2864 rnode_t *rp;
2865 lmpl_t *new;
2866 lmpl_t *cur;
2867 lmpl_t **lmplp;
2868 #ifdef DEBUG
2869 int list_len = 1;
2870 #endif /* DEBUG */
2871
2872 #ifdef DEBUG
2873 ++nfs_cnt_add_locking_id;
2874 #endif /* DEBUG */
2875 /*
2876 * allocate new lmpl_t now so we don't sleep
2877 * later after grabbing mutexes
2878 */
2879 ASSERT(len < MAX_SHR_OWNER_LEN);
2880 new = kmem_alloc(sizeof (*new), KM_SLEEP);
2881 new->lmpl_type = type;
2882 new->lmpl_pid = pid;
2883 new->lmpl_owner = kmem_alloc(len, KM_SLEEP);
2884 bcopy(id, new->lmpl_owner, len);
2885 new->lmpl_own_len = len;
2886 new->lmpl_next = (lmpl_t *)NULL;
2887 #ifdef DEBUG
2888 if (type == RLMPL_PID) {
2889 ASSERT(len == sizeof (pid_t));
2890 ASSERT(pid == *(pid_t *)new->lmpl_owner);
2891 } else {
2892 ASSERT(type == RLMPL_OWNER);
2893 }
2894 #endif
2895
2896 rp = VTOR(vp);
2897 mutex_enter(&rp->r_statelock);
2898
2899 /*
2900 * Add this id to the list for this rnode only if the
2901 * rnode is active and the id is not already there.
2902 */
2903 ASSERT(rp->r_flags & RHASHED);
2904 lmplp = &(rp->r_lmpl);
2905 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
2906 if (cur->lmpl_pid == pid &&
2907 cur->lmpl_type == type &&
2908 cur->lmpl_own_len == len &&
2909 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) {
2910 kmem_free(new->lmpl_owner, len);
2911 kmem_free(new, sizeof (*new));
2912 break;
2913 }
2914 lmplp = &cur->lmpl_next;
2915 #ifdef DEBUG
2916 ++list_len;
2917 #endif /* DEBUG */
2918 }
2919 if (cur == (lmpl_t *)NULL) {
2920 *lmplp = new;
2921 #ifdef DEBUG
2922 if (list_len > nfs_len_add_locking_id) {
2923 nfs_len_add_locking_id = list_len;
2924 }
2925 if (list_len > nfs_lmpl_high_water) {
2926 cmn_err(CE_WARN, "nfs_add_locking_id: long list "
2927 "vp=%p is %d", (void *)vp, list_len);
2928 }
2929 #endif /* DEBUG */
2930 }
2931
2932 #ifdef DEBUG
2933 if (share_debug) {
2934 int nitems = 0;
2935 int npids = 0;
2936 int nowners = 0;
2937
2938 /*
2939 * Count the number of things on r_lmpl after the add.
2940 */
2941 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
2942 cur = cur->lmpl_next) {
2943 nitems++;
2944 if (cur->lmpl_type == RLMPL_PID) {
2945 npids++;
2946 } else if (cur->lmpl_type == RLMPL_OWNER) {
2947 nowners++;
2948 } else {
2949 cmn_err(CE_PANIC, "nfs_add_locking_id: "
2950 "unrecognised lmpl_type %d",
2951 cur->lmpl_type);
2952 }
2953 }
2954
2955 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d "
2956 "OWNs = %d items left on r_lmpl\n",
2957 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems);
2958 }
2959 #endif
2960
2961 mutex_exit(&rp->r_statelock);
2962 }
2963
2964 /*
2965 * Remove an id from the lock manager id list.
2966 *
2967 * If the id is not in the list return 0. If it was found and
2968 * removed, return 1.
2969 */
2970 static int
2971 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen)
2972 {
2973 lmpl_t *cur;
2974 lmpl_t **lmplp;
2975 rnode_t *rp;
2976 int rv = 0;
2977
2978 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER);
2979
2980 rp = VTOR(vp);
2981
2982 mutex_enter(&rp->r_statelock);
2983 ASSERT(rp->r_flags & RHASHED);
2984 lmplp = &(rp->r_lmpl);
2985
2986 /*
2987 * Search through the list and remove the entry for this id
2988 * if it is there. The special case id == NULL allows removal
2989 * of the first share on the r_lmpl list belonging to the
2990 * current process (if any), without regard to further details
2991 * of its identity.
2992 */
2993 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) {
2994 if (cur->lmpl_type == type &&
2995 cur->lmpl_pid == curproc->p_pid &&
2996 (id == (char *)NULL ||
2997 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) {
2998 *lmplp = cur->lmpl_next;
2999 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN);
3000 if (rid != NULL) {
3001 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len);
3002 *rlen = cur->lmpl_own_len;
3003 }
3004 kmem_free(cur->lmpl_owner, cur->lmpl_own_len);
3005 kmem_free(cur, sizeof (*cur));
3006 rv = 1;
3007 break;
3008 }
3009 lmplp = &cur->lmpl_next;
3010 }
3011
3012 #ifdef DEBUG
3013 if (share_debug) {
3014 int nitems = 0;
3015 int npids = 0;
3016 int nowners = 0;
3017
3018 /*
3019 * Count the number of things left on r_lmpl after the remove.
3020 */
3021 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL;
3022 cur = cur->lmpl_next) {
3023 nitems++;
3024 if (cur->lmpl_type == RLMPL_PID) {
3025 npids++;
3026 } else if (cur->lmpl_type == RLMPL_OWNER) {
3027 nowners++;
3028 } else {
3029 cmn_err(CE_PANIC,
3030 "nrli: unrecognised lmpl_type %d",
3031 cur->lmpl_type);
3032 }
3033 }
3034
3035 cmn_err(CE_CONT,
3036 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n",
3037 (type == RLMPL_PID) ? "P" : "O",
3038 npids,
3039 nowners,
3040 nitems);
3041 }
3042 #endif
3043
3044 mutex_exit(&rp->r_statelock);
3045 return (rv);
3046 }
3047
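/*
 * Free a mntinfo structure.  The async manager and all worker threads
 * must already have exited (see the ASSERTs below); the mount is
 * removed from its zone's list before the locks and condition
 * variables are destroyed.
 */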
3048 void
3049 nfs_free_mi(mntinfo_t *mi)
3050 {
3051 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP);
3052 ASSERT(mi->mi_manager_thread == NULL);
3053 ASSERT(mi->mi_threads == 0);
3054
3055 /*
3056 * Remove the node from the global list before we start tearing it down.
3057 */
3058 nfs_mi_zonelist_remove(mi);
3059 if (mi->mi_klmconfig) {
3060 lm_free_config(mi->mi_klmconfig);
3061 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig));
3062 }
3063 mutex_destroy(&mi->mi_lock);
3064 mutex_destroy(&mi->mi_remap_lock);
3065 mutex_destroy(&mi->mi_async_lock);
3066 cv_destroy(&mi->mi_failover_cv);
3067 cv_destroy(&mi->mi_async_work_cv);
3068 cv_destroy(&mi->mi_async_reqs_cv);
3069 cv_destroy(&mi->mi_async_cv);
3070 zone_rele(mi->mi_zone);
3071 kmem_free(mi, sizeof (*mi));
3072 }
3073
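/*
 * kstat update routine for the read-only "mntinfo" kstat: snapshot the
 * per-mount state into the struct mntinfo_kstat supplied by the kstat
 * framework.
 */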
3074 static int
3075 mnt_kstat_update(kstat_t *ksp, int rw)
3076 {
3077 mntinfo_t *mi;
3078 struct mntinfo_kstat *mik;
3079 vfs_t *vfsp;
3080 int i;
3081
3082 /* this is a read-only kstat. Bail out on a write */
3083 if (rw == KSTAT_WRITE)
3084 return (EACCES);
3085
3086 /*
3087 * We don't want to wait here as kstat_chain_lock could be held by
3088 * dounmount(). dounmount() takes vfs_reflock before the chain lock
3089 * and thus could lead to a deadlock.
3090 */
3091 vfsp = (struct vfs *)ksp->ks_private;
3092
3093
3094 mi = VFTOMI(vfsp);
3095
3096 mik = (struct mntinfo_kstat *)ksp->ks_data;
3097
3098 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto);
3099 mik->mik_vers = (uint32_t)mi->mi_vers;
3100 mik->mik_flags = mi->mi_flags;
3101 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod;
3102 mik->mik_curread = (uint32_t)mi->mi_curread;
3103 mik->mik_curwrite = (uint32_t)mi->mi_curwrite;
3104 mik->mik_retrans = mi->mi_retrans;
3105 mik->mik_timeo = mi->mi_timeo;
3106 mik->mik_acregmin = HR2SEC(mi->mi_acregmin);
3107 mik->mik_acregmax = HR2SEC(mi->mi_acregmax);
3108 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin);
3109 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax);
3110 for (i = 0; i < NFS_CALLTYPES + 1; i++) {
3111 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt;
3112 mik->mik_timers[i].deviate =
3113 (uint32_t)mi->mi_timers[i].rt_deviate;
3114 mik->mik_timers[i].rtxcur =
3115 (uint32_t)mi->mi_timers[i].rt_rtxcur;
3116 }
3117 mik->mik_noresponse = (uint32_t)mi->mi_noresponse;
3118 mik->mik_failover = (uint32_t)mi->mi_failover;
3119 mik->mik_remap = (uint32_t)mi->mi_remap;
3120 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname);
3121
3122 return (0);
3123 }
3124
3125 void
3126 nfs_mnt_kstat_init(struct vfs *vfsp)
3127 {
3128 mntinfo_t *mi = VFTOMI(vfsp);
3129
3130 /*
3131 * Create the version specific kstats.
3132 *
3133 * PSARC 2001/697 Contract Private Interface
3134 * All nfs kstats are under SunMC contract
3135 * Please refer to the PSARC listed above and contact
3136 * SunMC before making any changes!
3137 *
3138 * Changes must be reviewed by Solaris File Sharing
3139 * Changes must be communicated to contract-2001-697@sun.com
3140 *
3141 */
3142
3143 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev),
3144 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id);
3145 if (mi->mi_io_kstats) {
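/*
 * For a mount in a non-global zone, also make the kstat
 * visible from the global zone (likewise for the mntinfo
 * kstat below).
 */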
3146 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3147 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID);
3148 mi->mi_io_kstats->ks_lock = &mi->mi_lock;
3149 kstat_install(mi->mi_io_kstats);
3150 }
3151
3152 if ((mi->mi_ro_kstats = kstat_create_zone("nfs",
3153 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW,
3154 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) {
3155 if (mi->mi_zone->zone_id != GLOBAL_ZONEID)
3156 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID);
3157 mi->mi_ro_kstats->ks_update = mnt_kstat_update;
3158 mi->mi_ro_kstats->ks_private = (void *)vfsp;
3159 kstat_install(mi->mi_ro_kstats);
3160 }
3161 }
3162
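/*
 * Allocate a delmap caller entry identifying the current thread;
 * presumably the delmap code links it onto the rnode's r_indelmap
 * list so that nfs_find_and_delete_delmapcall() can find it later.
 */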
3163 nfs_delmapcall_t *
3164 nfs_init_delmapcall()
3165 {
3166 nfs_delmapcall_t *delmap_call;
3167
3168 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP);
3169 delmap_call->call_id = curthread;
3170 delmap_call->error = 0;
3171
3172 return (delmap_call);
3173 }
3174
3175 void
3176 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call)
3177 {
3178 kmem_free(delmap_call, sizeof (nfs_delmapcall_t));
3179 }
3180
3181 /*
3182 * Searches for the current delmap caller (based on curthread) in the list of
3183 * callers. If it is found, we remove it and free the delmap caller.
3184 * Returns:
3185 * 0 if the caller wasn't found
3186 * 1 if the caller was found, removed and freed. *errp is set to what
3187 * the result of the delmap was.
3188 */
3189 int
3190 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3191 {
3192 nfs_delmapcall_t *delmap_call;
3193
3194 /*
3195 * If the list doesn't exist yet, we create it and return
3196 * that the caller wasn't found. No list = no callers.
3197 */
3198 mutex_enter(&rp->r_statelock);
3199 if (!(rp->r_flags & RDELMAPLIST)) {
3200 /* The list does not exist */
3201 list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3202 offsetof(nfs_delmapcall_t, call_node));
3203 rp->r_flags |= RDELMAPLIST;
3204 mutex_exit(&rp->r_statelock);
3205 return (0);
3206 } else {
3207 /* The list exists so search it */
3208 for (delmap_call = list_head(&rp->r_indelmap);
3209 delmap_call != NULL;
3210 delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3211 if (delmap_call->call_id == curthread) {
3212 /* current caller is in the list */
3213 *errp = delmap_call->error;
3214 list_remove(&rp->r_indelmap, delmap_call);
3215 mutex_exit(&rp->r_statelock);
3216 nfs_free_delmapcall(delmap_call);
3217 return (1);
3218 }
3219 }
3220 }
3221 mutex_exit(&rp->r_statelock);
3222 return (0);
3223 }