/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All rights reserved.
 */

/*
 * Copyright (c) 2006, The Ohio State University.  All rights reserved.
 *
 * Portions of this source code were developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
 *
 * Acknowledgements for contributions from developers:
 *	Ranjit Noronha:	noronha@cse.ohio-state.edu
 *	Lei Chai:	chail@cse.ohio-state.edu
 *	Weikuan Yu:	yuw@cse.ohio-state.edu
 */

#pragma ident	"@(#)nfs_client.c	1.193	05/10/11 SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/t_lock.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/stat.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/dnlc.h>
#include <sys/vmsystm.h>
#include <sys/flock.h>
#include <sys/share.h>
#include <sys/cmn_err.h>
#include <sys/tiuser.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/acl.h>
#include <sys/kstat.h>
#include <sys/signal.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/xdr.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>

#include <nfs/rnode.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>

static void	nfs3_attr_cache(vnode_t *, vattr_t *, vattr_t *, hrtime_t,
			cred_t *);
static int	nfs_getattr_cache(vnode_t *, struct vattr *);
static int	nfs_remove_locking_id(vnode_t *, int, char *, char *, int *);

struct mi_globals {
	kmutex_t	mig_lock;	/* lock protecting mig_list */
	list_t		mig_list;	/* list of NFS v2 or v3 mounts in zone */
	boolean_t	mig_destructor_called;
};

static zone_key_t mi_list_key;

/* Debugging flag for PC file shares. */
extern int	share_debug;

/*
 * used by RDMA transport to easily recognize READ3 call/reply
 * (FTDO -- for the demo only.
 * Better design needed for NFS4 or ON10 putback)
 */

extern xdrproc_t x_READ3args;
extern xdrproc_t x_READ3res;
extern xdrproc_t x_READ3uiores;
extern xdrproc_t x_READ3vres;

/*
 * Attributes caching:
 *
 * Attributes are cached in the rnode in struct vattr form.
 * There is a time associated with the cached attributes (r_attrtime)
 * which tells whether the attributes are valid.  The time is initialized
 * to the difference between current time and the modify time of the vnode
 * when new attributes are cached.  This allows the attributes for
 * files that have changed recently to be timed out sooner than for files
 * that have not changed for a long time.  There are minimum and maximum
 * timeout values that can be set per mount point.
 */

int
nfs_waitfor_purge_complete(vnode_t *vp)
{
	rnode_t *rp;
	k_sigset_t smask;

	rp = VTOR(vp);
	if (rp->r_serial != NULL && rp->r_serial != curthread) {
		mutex_enter(&rp->r_statelock);
		sigintr(&smask, VTOMI(vp)->mi_flags & MI_INT);
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				sigunintr(&smask);
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		sigunintr(&smask);
		mutex_exit(&rp->r_statelock);
	}
	return (0);
}

/*
 * Validate caches by checking cached attributes.  If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs_getattr_otw(vp, &va, cr));
}

/*
 * Validate caches by checking cached attributes.  If the cached
 * attributes have timed out, then get new attributes from the server.
 * As a side effect, this will do cache invalidation if the attributes
 * have changed.
 *
 * If the attributes have not timed out and if there is a cache
 * invalidation being done by some other thread, then wait until that
 * thread has completed the cache invalidation.
 */
int
nfs3_validate_caches(vnode_t *vp, cred_t *cr)
{
	int error;
	struct vattr va;

	if (ATTRCACHE_VALID(vp)) {
		error = nfs_waitfor_purge_complete(vp);
		if (error)
			return (error);
		return (0);
	}

	va.va_mask = AT_ALL;
	return (nfs3_getattr_otw(vp, &va, cr));
}

/*
 * Purge all of the various NFS `data' caches.
 */
void
nfs_purge_caches(vnode_t *vp, int purge_dnlc, cred_t *cr)
{
	rnode_t *rp;
	char *contents;
	int size;
	int error;

	/*
	 * Purge the DNLC for any entries which refer to this file.
	 * Avoid recursive entry into dnlc_purge_vp() in case of a directory.
	 */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (vp->v_count > 1 &&
	    (vp->v_type == VDIR || purge_dnlc == NFS_PURGE_DNLC) &&
	    !(rp->r_flags & RINDNLCPURGE)) {
		/*
		 * Set the RINDNLCPURGE flag to prevent recursive entry
		 * into dnlc_purge_vp()
		 */
		if (vp->v_type == VDIR)
			rp->r_flags |= RINDNLCPURGE;
		mutex_exit(&rp->r_statelock);
		dnlc_purge_vp(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_flags & RINDNLCPURGE)
			rp->r_flags &= ~RINDNLCPURGE;
	}

	/*
	 * Clear any readdir state bits and purge the readlink response cache.
	 */
	contents = rp->r_symlink.contents;
	size = rp->r_symlink.size;
	rp->r_symlink.contents = NULL;
	mutex_exit(&rp->r_statelock);

	if (contents != NULL) {
		kmem_free((void *)contents, size);
	}

	/*
	 * Flush the page cache.
	 */
	if (vn_has_cached_data(vp)) {
		error = VOP_PUTPAGE(vp, (u_offset_t)0, 0, B_INVAL, cr);
		if (error && (error == ENOSPC || error == EDQUOT)) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = error;
			mutex_exit(&rp->r_statelock);
		}
	}

	/*
	 * Flush the readdir response cache.
	 */
	if (HAVE_RDDIR_CACHE(rp))
		nfs_purge_rddir_cache(vp);
}

/*
 * Purge the readdir cache of all entries
 */
void
nfs_purge_rddir_cache(vnode_t *vp)
{
	rnode_t *rp;
	rddir_cache *rdc;
	rddir_cache *nrdc;

	rp = VTOR(vp);
top:
	mutex_enter(&rp->r_statelock);
	rp->r_direof = NULL;
	rp->r_flags &= ~RLOOKUP;
	rp->r_flags |= RREADDIRPLUS;
	rdc = avl_first(&rp->r_dir);
	while (rdc != NULL) {
		nrdc = AVL_NEXT(&rp->r_dir, rdc);
		avl_remove(&rp->r_dir, rdc);
		rddir_cache_rele(rdc);
		rdc = nrdc;
	}
	mutex_exit(&rp->r_statelock);
}

/*
 * Do a cache check based on the post-operation attributes.
 * Then make them the new cached attributes.  If no attributes
 * were returned, then mark the attributes as timed out.
 */
void
nfs3_cache_post_op_attr(vnode_t *vp, post_op_attr *poap, hrtime_t t, cred_t *cr)
{
	vattr_t attr;

	if (!poap->attributes) {
		PURGE_ATTRCACHE(vp);
		return;
	}
	(void) nfs3_cache_fattr3(vp, &poap->attr, &attr, t, cr);
}

/*
 * Same as above, but using a vattr
 */
void
nfs3_cache_post_op_vattr(vnode_t *vp, post_op_vattr *poap, hrtime_t t,
    cred_t *cr)
{
	if (!poap->attributes) {
		PURGE_ATTRCACHE(vp);
		return;
	}
	nfs_attr_cache(vp, poap->fres.vap, t, cr);
}

/*
 * Do a cache check based on the weak cache consistency attributes.
 * These consist of a small set of pre-operation attributes and the
 * full set of post-operation attributes.
 *
 * If we are given the pre-operation attributes, then use them to
 * check the validity of the various caches.  Then, if we got the
 * post-operation attributes, make them the new cached attributes.
 * If we didn't get the post-operation attributes, then mark the
 * attribute cache as timed out so that the next reference will
 * cause a GETATTR to the server to refresh with the current
 * attributes.
 *
 * Otherwise, if we didn't get the pre-operation attributes, but
 * we did get the post-operation attributes, then use these
 * attributes to check the validity of the various caches.
 * This will probably cause a flush of the caches because if the
 * operation succeeded, the attributes of the object were changed
 * in some way from the old post-operation attributes.  This
 * should be okay because it is the safe thing to do.  After
 * checking the data caches, then we make these the new cached
 * attributes.
 *
 * Otherwise, we didn't get either the pre- or post-operation
 * attributes.  Simply mark the attribute cache as timed out so
 * the next reference will cause a GETATTR to the server to
 * refresh with the current attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_cache_wcc_data(vnode_t *vp, wcc_data *wccp, hrtime_t t, cred_t *cr)
{
	vattr_t bva;
	vattr_t ava;

	if (wccp->after.attributes) {
		if (fattr3_to_vattr(vp, &wccp->after.attr, &ava)) {
			PURGE_ATTRCACHE(vp);
			return;
		}
		if (wccp->before.attributes) {
			bva.va_ctime.tv_sec = wccp->before.attr.ctime.seconds;
			bva.va_ctime.tv_nsec = wccp->before.attr.ctime.nseconds;
			bva.va_mtime.tv_sec = wccp->before.attr.mtime.seconds;
			bva.va_mtime.tv_nsec = wccp->before.attr.mtime.nseconds;
			bva.va_size = wccp->before.attr.size;
			nfs3_attr_cache(vp, &bva, &ava, t, cr);
		} else
			nfs_attr_cache(vp, &ava, t, cr);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using nfsattr.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs_attrcache(vnode_t *vp, struct nfsfattr *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!nattr_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Set attributes cache for given vnode using fattr3.
 *
 * This routine does not do cache validation with the attributes.
 *
 * If an error occurred trying to convert the over the wire
 * attributes to a vattr, then simply mark the attribute cache as
 * timed out.
 */
void
nfs3_attrcache(vnode_t *vp, fattr3 *na, hrtime_t t)
{
	rnode_t *rp;
	struct vattr va;

	if (!fattr3_to_vattr(vp, na, &va)) {
		rp = VTOR(vp);
		mutex_enter(&rp->r_statelock);
		if (rp->r_mtime <= t)
			nfs_attrcache_va(vp, &va);
		mutex_exit(&rp->r_statelock);
	} else {
		PURGE_ATTRCACHE(vp);
	}
}

/*
 * Do a cache check based on attributes returned over the wire.  The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs_cache_fattr(vnode_t *vp, struct nfsfattr *na, vattr_t *vap, hrtime_t t,
    cred_t *cr)
{
	int error;

	error = nattr_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Do a cache check based on attributes returned over the wire.  The
 * new attributes are cached.
 *
 * If an error occurred trying to convert the over the wire attributes
 * to a vattr, then just return that error.
 *
 * As a side effect, the vattr argument is filled in with the converted
 * attributes.
 */
int
nfs3_cache_fattr3(vnode_t *vp, fattr3 *na, vattr_t *vap, hrtime_t t, cred_t *cr)
{
	int error;

	error = fattr3_to_vattr(vp, na, vap);
	if (error)
		return (error);
	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Use the passed in virtual attributes to check to see whether the
 * data and metadata caches are valid, cache the new attributes, and
 * then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.  If required,
 * the cache invalidation is done atomically w.r.t. the cache
 * validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with a single set of post operation attributes.
 */
void
nfs_attr_cache(vnode_t *vp, vattr_t *vap, hrtime_t t, cred_t *cr)
{
	rnode_t *rp;
	int mtime_changed;
	int ctime_changed;
	vsecattr_t *vsp;
	int was_serial;

	rp = VTOR(vp);

	mutex_enter(&rp->r_statelock);

	if (rp->r_serial != curthread) {
		klwp_t *lwp = ttolwp(curthread);

		was_serial = 0;
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	} else
		was_serial = 1;

	if (rp->r_mtime > t) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (!(rp->r_flags & RWRITEATTR)) {
		if (!CACHE_VALID(rp, vap->va_mtime, vap->va_size))
			mtime_changed = 1;
		else
			mtime_changed = 0;
		if (rp->r_attr.va_ctime.tv_sec != vap->va_ctime.tv_sec ||
		    rp->r_attr.va_ctime.tv_nsec != vap->va_ctime.tv_nsec)
			ctime_changed = 1;
		else
			ctime_changed = 0;
	} else if (rp->r_size != vap->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0))) {
		mtime_changed = 1;
		ctime_changed = 0;
	} else {
		mtime_changed = 0;
		ctime_changed = 0;
	}

	nfs_attrcache_va(vp, vap);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	if (mtime_changed)
		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

	if (ctime_changed) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Use the passed in "before" virtual attributes to check to see
 * whether the data and metadata caches are valid, cache the "after"
 * new attributes, and then do the cache invalidation if required.
 *
 * The cache validation and caching of the new attributes is done
 * atomically via the use of the mutex, r_statelock.
 * If required, the cache invalidation is done atomically w.r.t. the
 * cache validation and caching of the attributes via the pseudo lock,
 * r_serial.
 *
 * This routine is used to do cache validation and attributes caching
 * for operations with both pre operation attributes and post operation
 * attributes.
 */
static void
nfs3_attr_cache(vnode_t *vp, vattr_t *bvap, vattr_t *avap, hrtime_t t,
    cred_t *cr)
{
	rnode_t *rp;
	int mtime_changed;
	int ctime_changed;
	vsecattr_t *vsp;
	int was_serial;

	rp = VTOR(vp);

	mutex_enter(&rp->r_statelock);

	if (rp->r_serial != curthread) {
		klwp_t *lwp = ttolwp(curthread);

		was_serial = 0;
		if (lwp != NULL)
			lwp->lwp_nostop++;
		while (rp->r_serial != NULL) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				if (lwp != NULL)
					lwp->lwp_nostop--;
				return;
			}
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	} else
		was_serial = 1;

	if (rp->r_mtime > t) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (!(rp->r_flags & RWRITEATTR)) {
		if (!CACHE_VALID(rp, bvap->va_mtime, bvap->va_size))
			mtime_changed = 1;
		else
			mtime_changed = 0;
		if (rp->r_attr.va_ctime.tv_sec != bvap->va_ctime.tv_sec ||
		    rp->r_attr.va_ctime.tv_nsec != bvap->va_ctime.tv_nsec)
			ctime_changed = 1;
		else
			ctime_changed = 0;
	} else {
		mtime_changed = 0;
		ctime_changed = 0;
	}

	nfs_attrcache_va(vp, avap);

	if (!mtime_changed && !ctime_changed) {
		mutex_exit(&rp->r_statelock);
		return;
	}

	rp->r_serial = curthread;

	mutex_exit(&rp->r_statelock);

	if (mtime_changed)
		nfs_purge_caches(vp, NFS_NOPURGE_DNLC, cr);

	if (ctime_changed) {
		(void) nfs_access_purge_rp(rp);
		if (rp->r_secattr != NULL) {
			mutex_enter(&rp->r_statelock);
			vsp = rp->r_secattr;
			rp->r_secattr = NULL;
			mutex_exit(&rp->r_statelock);
			if (vsp != NULL)
				nfs_acl_free(vsp);
		}
	}

	if (!was_serial) {
		mutex_enter(&rp->r_statelock);
		rp->r_serial = NULL;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
	}
}

/*
 * Set attributes cache for given vnode using virtual attributes.
 *
 * Set the timeout value on the attribute cache and fill it
 * with the passed in attributes.
 *
 * The caller must be holding r_statelock.
 */
void
nfs_attrcache_va(vnode_t *vp, struct vattr *va)
{
	rnode_t *rp;
	mntinfo_t *mi;
	hrtime_t delta;
	hrtime_t now;

	rp = VTOR(vp);

	ASSERT(MUTEX_HELD(&rp->r_statelock));

	now = gethrtime();

	mi = VTOMI(vp);

	/*
	 * Delta is the number of nanoseconds that we will
	 * cache the attributes of the file.  It is based on
	 * the number of nanoseconds since the last time that
	 * we detected a change.  The assumption is that files
	 * that changed recently are likely to change again.
	 * There is a minimum and a maximum for regular files
	 * and for directories which is enforced though.
	 *
	 * Using the time since last change was detected
	 * eliminates direct comparison or calculation
	 * using mixed client and server times.  NFS does
	 * not make any assumptions regarding the client
	 * and server clocks being synchronized.
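	 *
	 * As an illustration (the numbers here are only an example;
	 * the actual bounds come from the acregmin/acregmax and
	 * acdirmin/acdirmax mount options): with acregmin=3s and
	 * acregmax=60s, a regular file last seen to change 10 seconds
	 * ago gets delta = 10s, one that changed 1 second ago is
	 * clamped up to the 3s floor, and one unchanged for 5 minutes
	 * is clamped down to the 60s ceiling; r_attrtime is then set
	 * to now + delta.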
	 */
	if (va->va_mtime.tv_sec != rp->r_attr.va_mtime.tv_sec ||
	    va->va_mtime.tv_nsec != rp->r_attr.va_mtime.tv_nsec ||
	    va->va_size != rp->r_attr.va_size)
		rp->r_mtime = now;

	if ((mi->mi_flags & MI_NOAC) || (vp->v_flag & VNOCACHE))
		delta = 0;
	else {
		delta = now - rp->r_mtime;
		if (vp->v_type == VDIR) {
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		} else {
			if (delta < mi->mi_acregmin)
				delta = mi->mi_acregmin;
			else if (delta > mi->mi_acregmax)
				delta = mi->mi_acregmax;
		}
	}
	rp->r_attrtime = now + delta;
	rp->r_attr = *va;
	/*
	 * Update the size of the file if there is no cached data or if
	 * the cached data is clean and there is no data being written
	 * out.
	 */
	if (rp->r_size != va->va_size &&
	    (!vn_has_cached_data(vp) ||
	    (!(rp->r_flags & RDIRTY) && rp->r_count == 0)))
		rp->r_size = va->va_size;
	nfs_setswaplike(vp, va);
	rp->r_flags &= ~RWRITEATTR;
}

/*
 * Fill in attribute from the cache.
 * If valid, then return 0 to indicate that no error occurred,
 * otherwise return 1 to indicate that an error occurred.
 */
static int
nfs_getattr_cache(vnode_t *vp, struct vattr *vap)
{
	rnode_t *rp;

	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	if (ATTRCACHE_VALID(vp)) {
		/*
		 * Cached attributes are valid
		 */
		*vap = rp->r_attr;
		mutex_exit(&rp->r_statelock);
		return (0);
	}
	mutex_exit(&rp->r_statelock);
	return (1);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	struct nfsattrstat ns;
	int douprintf;
	mntinfo_t *mi;
	failinfo_t fi;
	hrtime_t t;

	mi = VTOMI(vp);
	fi.vp = vp;
	fi.fhp = NULL;		/* no need to update, filehandle not copied */
	fi.copyproc = nfscopyfh;
	fi.lookupproc = nfslookup;
	fi.xattrdirproc = acl_getxattrdir2;

	if (mi->mi_flags & MI_ACL) {
		error = acl_getattr2_otw(vp, vap, cr);
		if (mi->mi_flags & MI_ACL)
			return (error);
	}

	douprintf = 1;

	t = gethrtime();

	error = rfs2call(mi, RFS_GETATTR,
	    xdr_fhandle, (caddr_t)VTOFH(vp),
	    xdr_attrstat, (caddr_t)&ns, cr,
	    &douprintf, &ns.ns_status, 0, &fi);

	if (!error) {
		error = geterrno(ns.ns_status);
		if (!error)
			error = nfs_cache_fattr(vp, &ns.ns_attr, vap, t, cr);
		else {
			PURGE_STALE_FH(error, vp, cr);
		}
	}

	return (error);
}

/*
 * Return either cached or remote attributes.  If we get remote
 * attributes, use them to check and invalidate caches, then cache
 * the new attributes.
 */
int
nfsgetattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	rnode_t *rp;

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	return (error);
}

/*
 * Get attributes over-the-wire and update attributes cache
 * if no error occurred in the over-the-wire operation.
 * Return 0 if successful, otherwise error.
 */
int
nfs3_getattr_otw(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	GETATTR3args args;
	GETATTR3vres res;
	int douprintf;
	failinfo_t fi;
	hrtime_t t;

	args.object = *VTOFH3(vp);
	fi.vp = vp;
	fi.fhp = (caddr_t)&args.object;
	fi.copyproc = nfs3copyfh;
	fi.lookupproc = nfs3lookup;
	fi.xattrdirproc = acl_getxattrdir3;
	res.fres.vp = vp;
	res.fres.vap = vap;

	douprintf = 1;

	t = gethrtime();

	error = rfs3call(VTOMI(vp), NFSPROC3_GETATTR,
	    xdr_nfs_fh3, (caddr_t)&args,
	    xdr_GETATTR3vres, (caddr_t)&res, cr,
	    &douprintf, &res.status, 0, &fi);

	if (error)
		return (error);

	error = geterrno3(res.status);
	if (error) {
		PURGE_STALE_FH(error, vp, cr);
		return (error);
	}

	/*
	 * Catch status codes that indicate fattr3 to vattr translation failure
	 */
	if (res.fres.status)
		return (res.fres.status);

	nfs_attr_cache(vp, vap, t, cr);
	return (0);
}

/*
 * Return either cached or remote attributes.  If we get remote
 * attributes, use them to check and invalidate caches, then cache
 * the new attributes.
 */
int
nfs3getattr(vnode_t *vp, struct vattr *vap, cred_t *cr)
{
	int error;
	rnode_t *rp;

	/*
	 * If we've got cached attributes, we're done, otherwise go
	 * to the server to get attributes, which will update the cache
	 * in the process.
	 */
	error = nfs_getattr_cache(vp, vap);
	if (error)
		error = nfs3_getattr_otw(vp, vap, cr);

	/* Return the client's view of file size */
	rp = VTOR(vp);
	mutex_enter(&rp->r_statelock);
	vap->va_size = rp->r_size;
	mutex_exit(&rp->r_statelock);

	return (error);
}

vtype_t nf_to_vt[] = {
	VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK
};
/*
 * Convert NFS Version 2 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 * Returns 0 for success, error if failed due to overflow.
 */
int
nattr_to_vattr(vnode_t *vp, struct nfsfattr *na, struct vattr *vap)
{
	/* overflow in time attributes? */
#ifndef _LP64
	if (!NFS2_FATTR_TIME_OK(na))
		return (EOVERFLOW);
#endif

	if (na->na_type < NFNON || na->na_type > NFSOC)
		vap->va_type = VBAD;
	else
		vap->va_type = nf_to_vt[na->na_type];
	vap->va_mode = na->na_mode;
	vap->va_uid = (na->na_uid == NFS_UID_NOBODY) ? UID_NOBODY : na->na_uid;
	vap->va_gid = (na->na_gid == NFS_GID_NOBODY) ? GID_NOBODY : na->na_gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->na_nodeid;
	vap->va_nlink = na->na_nlink;
	vap->va_size = na->na_size;	/* keep for cache validation */
	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->na_atime.tv_sec);
	vap->va_atime.tv_nsec = (uint32_t)(na->na_atime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->na_mtime.tv_sec);
	vap->va_mtime.tv_nsec = (uint32_t)(na->na_mtime.tv_usec * 1000);
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->na_ctime.tv_sec);
	vap->va_ctime.tv_nsec = (uint32_t)(na->na_ctime.tv_usec * 1000);
	/*
	 * Shannon's law - uncompress the received dev_t
	 * if the top half of it is zero, indicating a response
	 * from an `older style' OS.  Except for when it is a
	 * `new style' OS sending the maj device of zero,
	 * in which case the algorithm still works because the
	 * fact that it is a new style server
	 * is hidden by the minor device not being greater
	 * than 255 (a requirement in this case).
	 */
	if ((na->na_rdev & 0xffff0000) == 0)
		vap->va_rdev = nfsv2_expdev(na->na_rdev);
	else
		vap->va_rdev = expldev(na->na_rdev);

	vap->va_nblocks = na->na_blocks;
	switch (na->na_type) {
	case NFBLK:
		vap->va_blksize = DEV_BSIZE;
		break;

	case NFCHR:
		vap->va_blksize = MAXBSIZE;
		break;

	case NFSOC:
	default:
		vap->va_blksize = na->na_blocksize;
		break;
	}
	/*
	 * This bit of ugliness is a hack to preserve the
	 * over-the-wire protocols for named-pipe vnodes.
	 * It remaps the special over-the-wire type to the
	 * VFIFO type. (see note in nfs.h)
	 */
	if (NA_ISFIFO(na)) {
		vap->va_type = VFIFO;
		vap->va_mode = (vap->va_mode & ~S_IFMT) | S_IFIFO;
		vap->va_rdev = 0;
		vap->va_blksize = na->na_blocksize;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Convert NFS Version 3 over the network attributes to the local
 * virtual attributes.  The mapping between the UID_NOBODY/GID_NOBODY
 * network representation and the local representation is done here.
 */
vtype_t nf3_to_vt[] = {
	VBAD, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO
};

int
fattr3_to_vattr(vnode_t *vp, fattr3 *na, struct vattr *vap)
{

#ifndef _LP64
	/* overflow in time attributes? */
	if (!NFS3_FATTR_TIME_OK(na))
		return (EOVERFLOW);
#endif
	if (!NFS3_SIZE_OK(na->size))
		/* file too big */
		return (EFBIG);

	vap->va_mask = AT_ALL;

	if (na->type < NF3REG || na->type > NF3FIFO)
		vap->va_type = VBAD;
	else
		vap->va_type = nf3_to_vt[na->type];
	vap->va_mode = na->mode;
	vap->va_uid = (na->uid == NFS_UID_NOBODY) ? UID_NOBODY : (uid_t)na->uid;
	vap->va_gid = (na->gid == NFS_GID_NOBODY) ? GID_NOBODY : (gid_t)na->gid;
	vap->va_fsid = vp->v_vfsp->vfs_dev;
	vap->va_nodeid = na->fileid;
	vap->va_nlink = na->nlink;
	vap->va_size = na->size;

	/*
	 * nfs protocol defines times as unsigned so don't extend sign,
	 * unless sysadmin set nfs_allow_preepoch_time.
	 */
	NFS_TIME_T_CONVERT(vap->va_atime.tv_sec, na->atime.seconds);
	vap->va_atime.tv_nsec = (uint32_t)na->atime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_mtime.tv_sec, na->mtime.seconds);
	vap->va_mtime.tv_nsec = (uint32_t)na->mtime.nseconds;
	NFS_TIME_T_CONVERT(vap->va_ctime.tv_sec, na->ctime.seconds);
	vap->va_ctime.tv_nsec = (uint32_t)na->ctime.nseconds;

	switch (na->type) {
	case NF3BLK:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = DEV_BSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3CHR:
		vap->va_rdev = makedevice(na->rdev.specdata1,
		    na->rdev.specdata2);
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	case NF3REG:
	case NF3DIR:
	case NF3LNK:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = (u_longlong_t)
		    ((na->used + (size3)DEV_BSIZE - (size3)1) /
		    (size3)DEV_BSIZE);
		break;
	case NF3SOCK:
	case NF3FIFO:
	default:
		vap->va_rdev = 0;
		vap->va_blksize = MAXBSIZE;
		vap->va_nblocks = 0;
		break;
	}
	vap->va_seq = 0;
	return (0);
}

/*
 * Asynchronous I/O parameters.  nfs_async_threads is the high-water mark
 * for the demand-based allocation of async threads per-mount.  The
 * nfs_async_timeout is the amount of time a thread will live after it
 * becomes idle, unless new I/O requests are received before the thread
 * dies.  See nfs_async_putpage and nfs_async_start.
 */

int nfs_async_timeout = -1;	/* uninitialized */

static void	nfs_async_start(struct vfs *);

static void
free_async_args(struct nfs_async_reqs *args)
{
	rnode_t *rp;

	if (args->a_io != NFS_INACTIVE) {
		rp = VTOR(args->a_vp);
		mutex_enter(&rp->r_statelock);
		rp->r_count--;
		if (args->a_io == NFS_PUTAPAGE ||
		    args->a_io == NFS_PAGEIO)
			rp->r_awcount--;
		cv_broadcast(&rp->r_cv);
		mutex_exit(&rp->r_statelock);
		VN_RELE(args->a_vp);
	}
	crfree(args->a_cred);
	kmem_free(args, sizeof (*args));
}

/*
 * Cross-zone thread creation and NFS access is disallowed, yet fsflush() and
 * pageout(), running in the global zone, have legitimate reasons to do
 * VOP_PUTPAGE(B_ASYNC) on other zones' NFS mounts.  We avoid the problem by
 * use of a per-mount "asynchronous requests manager thread" which is
 * signaled by the various asynchronous work routines when there is
 * asynchronous work to be done.  It is responsible for creating new
 * worker threads if necessary, and notifying existing worker threads
 * that there is work to be done.
 *
 * In other words, it will "take the specifications from the customers and
 * give them to the engineers."
 *
 * Worker threads die off of their own accord if they are no longer
 * needed.
 *
 * This thread is killed when the zone is going away or the filesystem
 * is being unmounted.
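 *
 * In outline (see the routines below): an nfs_async_*() routine queues
 * a request on mi_async_reqs[] and cv_signal()s mi_async_reqs_cv; this
 * manager thread wakes up, creates a worker (nfs_async_start) if one
 * is needed, and cv_signal()s mi_async_work_cv so that a worker picks
 * the request up.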
 */
void
nfs_async_manager(vfs_t *vfsp)
{
	callb_cpr_t cprinfo;
	mntinfo_t *mi;
	uint_t max_threads;

	mi = VFTOMI(vfsp);

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr,
	    "nfs_async_manager");

	mutex_enter(&mi->mi_async_lock);
	/*
	 * We want to stash the max number of threads that this mount was
	 * allowed so we can use it later when the variable is set to zero as
	 * part of the zone/mount going away.
	 *
	 * We want to be able to create at least one thread to handle
	 * asynchronous inactive calls.
	 */
	max_threads = MAX(mi->mi_max_threads, 1);
	mutex_enter(&mi->mi_lock);
	/*
	 * We don't want to wait for mi_max_threads to go to zero, since that
	 * happens as part of a failed unmount, but this thread should only
	 * exit when the mount/zone is really going away.
	 *
	 * Once MI_ASYNC_MGR_STOP is set, no more async operations will be
	 * attempted: the various _async_*() functions know to do things
	 * inline if mi_max_threads == 0.  Henceforth we just drain out the
	 * outstanding requests.
	 *
	 * Note that we still create zthreads even if we notice the zone is
	 * shutting down (MI_ASYNC_MGR_STOP is set); this may cause the zone
	 * shutdown sequence to take slightly longer in some cases, but
	 * doesn't violate the protocol, as all threads will exit as soon as
	 * they're done processing the remaining requests.
	 */
	while (!(mi->mi_flags & MI_ASYNC_MGR_STOP) ||
	    mi->mi_async_req_count > 0) {
		mutex_exit(&mi->mi_lock);
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(&mi->mi_async_reqs_cv, &mi->mi_async_lock);
		CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);
		while (mi->mi_async_req_count > 0) {
			/*
			 * Paranoia: If the mount started out having
			 * (mi->mi_max_threads == 0), and the value was
			 * later changed (via a debugger or somesuch),
			 * we could be confused since we will think we
			 * can't create any threads, and the calling
			 * code (which looks at the current value of
			 * mi->mi_max_threads, now non-zero) thinks we
			 * can.
			 *
			 * So, because we're paranoid, we create threads
			 * up to the maximum of the original and the
			 * current value.  This means that future
			 * (debugger-induced) lowerings of
			 * mi->mi_max_threads are ignored for our
			 * purposes, but who told them they could change
			 * random values on a live kernel anyhow?
			 */
			if (mi->mi_threads <
			    MAX(mi->mi_max_threads, max_threads)) {
				mi->mi_threads++;
				mutex_exit(&mi->mi_async_lock);
				VFS_HOLD(vfsp);	/* hold for new thread */
				(void) zthread_create(NULL, 0, nfs_async_start,
				    vfsp, 0, minclsyspri);
				mutex_enter(&mi->mi_async_lock);
			}
			cv_signal(&mi->mi_async_work_cv);
			ASSERT(mi->mi_async_req_count != 0);
			mi->mi_async_req_count--;
		}
		mutex_enter(&mi->mi_lock);
	}
	mutex_exit(&mi->mi_lock);
	/*
	 * Let everyone know we're done.
	 */
	mi->mi_manager_thread = NULL;
	cv_broadcast(&mi->mi_async_cv);

	/*
	 * There is no explicit call to mutex_exit(&mi->mi_async_lock)
	 * since CALLB_CPR_EXIT is actually responsible for releasing
	 * 'mi_async_lock'.
	 */
	CALLB_CPR_EXIT(&cprinfo);
	VFS_RELE(vfsp);	/* release thread's hold */
	zthread_exit();
}

/*
 * Signal (and wait for) the async manager thread to clean up and go away.
 */
void
nfs_async_manager_stop(vfs_t *vfsp)
{
	mntinfo_t *mi = VFTOMI(vfsp);

	mutex_enter(&mi->mi_async_lock);
	mutex_enter(&mi->mi_lock);
	mi->mi_flags |= MI_ASYNC_MGR_STOP;
	mutex_exit(&mi->mi_lock);
	cv_broadcast(&mi->mi_async_reqs_cv);
	while (mi->mi_manager_thread != NULL)
		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
	mutex_exit(&mi->mi_async_lock);
}

int
nfs_async_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr,
    struct seg *seg, cred_t *cr, void (*readahead)(vnode_t *,
    u_offset_t, caddr_t, struct seg *, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	rp = VTOR(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI(vp);

	/*
	 * If addr falls in a different segment, don't bother doing readahead.
	 */
	if (addr >= seg->s_base + seg->s_size)
		return (-1);

	/*
	 * If we can't allocate a request structure, punt on the readahead.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		return (-1);

	/*
	 * If a lock operation is pending, don't initiate any new
	 * readaheads.  Otherwise, bump r_count to indicate the new
	 * asynchronous I/O.
	 */
	if (!nfs_rw_tryenter(&rp->r_lkserlock, RW_READER)) {
		kmem_free(args, sizeof (*args));
		return (-1);
	}
	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);
	nfs_rw_exit(&rp->r_lkserlock);

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_READ_AHEAD;
	args->a_nfs_readahead = readahead;
	args->a_nfs_blkoff = blkoff;
	args->a_nfs_seg = seg;
	args->a_nfs_addr = addr;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, don't bother readahead.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READ_AHEAD] == NULL) {
		mi->mi_async_reqs[NFS_READ_AHEAD] = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	} else {
		mi->mi_async_tail[NFS_READ_AHEAD]->a_next = args;
		mi->mi_async_tail[NFS_READ_AHEAD] = args;
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	mutex_enter(&rp->r_statelock);
	rp->r_count--;
	cv_broadcast(&rp->r_cv);
	mutex_exit(&rp->r_statelock);
	VN_RELE(vp);
	crfree(cr);
	kmem_free(args, sizeof (*args));
	return (-1);
}

int
nfs_async_putapage(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
    int flags, cred_t *cr, int (*putapage)(vnode_t *, page_t *,
    u_offset_t, size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the putpage
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PUTAPAGE;
	args->a_nfs_putapage = putapage;
	args->a_nfs_pp = pp;
	args->a_nfs_off = off;
	args->a_nfs_len = (uint_t)len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PUTAPAGE] == NULL) {
		mi->mi_async_reqs[NFS_PUTAPAGE] = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	} else {
		mi->mi_async_tail[NFS_PUTAPAGE]->a_next = args;
		mi->mi_async_tail[NFS_PUTAPAGE] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout (and the machine).
		 * In this case, we just re-mark the page as dirty and punt
		 * on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}
	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync putpage.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*putapage)(vp, pp, off, len, flags, cr));
}

int
nfs_async_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
    int flags, cred_t *cr, int (*pageio)(vnode_t *, page_t *, u_offset_t,
    size_t, int, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	ASSERT(flags & B_ASYNC);
	ASSERT(vp->v_vfsp != NULL);

	rp = VTOR(vp);
	ASSERT(rp->r_count > 0);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the pageio
	 * request synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_PAGEIO;
	args->a_nfs_pageio = pageio;
	args->a_nfs_pp = pp;
	args->a_nfs_off = io_off;
	args->a_nfs_len = (uint_t)io_len;
	args->a_nfs_flags = flags;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_PAGEIO] == NULL) {
		mi->mi_async_reqs[NFS_PAGEIO] = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	} else {
		mi->mi_async_tail[NFS_PAGEIO]->a_next = args;
		mi->mi_async_tail[NFS_PAGEIO] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	rp->r_awcount++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return (0);

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	/*
	 * If we can't do it ASYNC, for reads we do nothing (but cleanup
	 * the page list), for writes we do it synchronously, except for
	 * proc_pageout/proc_fsflush as described below.
	 */
	if (flags & B_READ) {
		pvn_read_done(pp, flags | B_ERROR);
		return (0);
	}

	if (curproc == proc_pageout || curproc == proc_fsflush) {
		/*
		 * If we get here in the context of the pageout/fsflush,
		 * we refuse to do a sync write, because this may hang
		 * pageout/fsflush (and the machine).  In this case, we just
		 * re-mark the page as dirty and punt on the page.
		 *
		 * Make sure B_FORCE isn't set.  We can re-mark the
		 * pages as dirty and unlock the pages in one swoop by
		 * passing in B_ERROR to pvn_write_done().  However,
		 * we should make sure B_FORCE isn't set - we don't
		 * want the page tossed before it gets written out.
		 */
		if (flags & B_FORCE)
			flags &= ~(B_INVAL | B_FORCE);
		pvn_write_done(pp, flags | B_ERROR);
		return (0);
	}

	if (nfs_zone() != mi->mi_zone) {
		/*
		 * So this was a cross-zone sync pageio.  We pass in B_ERROR
		 * to pvn_write_done() to re-mark the pages as dirty and unlock
		 * them.
		 *
		 * We don't want to clear B_FORCE here as the caller presumably
		 * knows what they're doing if they set it.
		 */
		pvn_write_done(pp, flags | B_ERROR);
		return (EPERM);
	}
	return ((*pageio)(vp, pp, io_off, io_len, flags, cr));
}

void
nfs_async_readdir(vnode_t *vp, rddir_cache *rdc, cred_t *cr,
    int (*readdir)(vnode_t *, rddir_cache *, cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	rp = VTOR(vp);
	ASSERT(rp->r_freef == NULL);

	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the readdir
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_READDIR;
	args->a_nfs_readdir = readdir;
	args->a_nfs_rdc = rdc;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_READDIR] == NULL) {
		mi->mi_async_reqs[NFS_READDIR] = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	} else {
		mi->mi_async_tail[NFS_READDIR]->a_next = args;
		mi->mi_async_tail[NFS_READDIR] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	rdc->entries = NULL;
	mutex_enter(&rp->r_statelock);
	ASSERT(rdc->flags & RDDIR);
	rdc->flags &= ~RDDIR;
	rdc->flags |= RDDIRREQ;
	/*
	 * Check the flag to see if RDDIRWAIT is set.  If RDDIRWAIT
	 * is set, wakeup the thread sleeping in cv_wait_sig().
	 * The woken up thread will reset the flag to RDDIR and will
	 * continue with the readdir operation.
	 */
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	}
	mutex_exit(&rp->r_statelock);
	rddir_cache_rele(rdc);
}

void
nfs_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
    cred_t *cr, void (*commit)(vnode_t *, page_t *, offset3, count3,
    cred_t *))
{
	rnode_t *rp;
	mntinfo_t *mi;
	struct nfs_async_reqs *args;
	page_t *pp;

	rp = VTOR(vp);
	mi = VTOMI(vp);

	/*
	 * If we can't allocate a request structure, do the commit
	 * operation synchronously in this thread's context.
	 */
	if ((args = kmem_alloc(sizeof (*args), KM_NOSLEEP)) == NULL)
		goto noasync;

	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	VN_HOLD(vp);
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_COMMIT;
	args->a_nfs_commit = commit;
	args->a_nfs_plist = plist;
	args->a_nfs_offset = offset;
	args->a_nfs_count = count;

	mutex_enter(&mi->mi_async_lock);

	/*
	 * If asyncio has been disabled, then make a synchronous request.
	 * This check is done a second time in case async io was disabled
	 * while this thread was blocked waiting for memory pressure to
	 * reduce or for the queue to drain.
	 */
	if (mi->mi_max_threads == 0) {
		mutex_exit(&mi->mi_async_lock);
		goto noasync;
	}

	/*
	 * Link request structure into the async list and
	 * wakeup async thread to do the i/o.
	 */
	if (mi->mi_async_reqs[NFS_COMMIT] == NULL) {
		mi->mi_async_reqs[NFS_COMMIT] = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	} else {
		mi->mi_async_tail[NFS_COMMIT]->a_next = args;
		mi->mi_async_tail[NFS_COMMIT] = args;
	}

	mutex_enter(&rp->r_statelock);
	rp->r_count++;
	mutex_exit(&rp->r_statelock);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_waitq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
	return;

noasync:
	if (args != NULL) {
		VN_RELE(vp);
		crfree(cr);
		kmem_free(args, sizeof (*args));
	}

	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != mi->mi_zone) {
		while (plist != NULL) {
			pp = plist;
			page_sub(&plist, pp);
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
		}
		return;
	}
	(*commit)(vp, plist, offset, count, cr);
}

void
nfs_async_inactive(vnode_t *vp, cred_t *cr,
    void (*inactive)(vnode_t *, cred_t *))
{
	mntinfo_t *mi;
	struct nfs_async_reqs *args;

	mi = VTOMI(vp);

	args = kmem_alloc(sizeof (*args), KM_SLEEP);
	args->a_next = NULL;
#ifdef DEBUG
	args->a_queuer = curthread;
#endif
	args->a_vp = vp;
	ASSERT(cr != NULL);
	crhold(cr);
	args->a_cred = cr;
	args->a_io = NFS_INACTIVE;
	args->a_nfs_inactive = inactive;

	/*
	 * Note that we don't check mi->mi_max_threads here, since we
	 * *need* to get rid of this vnode regardless of whether someone
	 * set nfs3_max_threads/nfs_max_threads to zero in /etc/system.
	 *
	 * The manager thread knows about this and is willing to create
	 * at least one thread to accommodate us.
	 */
	mutex_enter(&mi->mi_async_lock);
	if (mi->mi_manager_thread == NULL) {
		rnode_t *rp = VTOR(vp);

		mutex_exit(&mi->mi_async_lock);
		crfree(cr);	/* drop our reference */
		kmem_free(args, sizeof (*args));
		/*
		 * We can't do an over-the-wire call since we're in the wrong
		 * zone, so we need to clean up state as best we can and then
		 * throw away the vnode.
		 */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp != NULL) {
			vnode_t *unldvp;
			char *unlname;
			cred_t *unlcred;

			unldvp = rp->r_unldvp;
			rp->r_unldvp = NULL;
			unlname = rp->r_unlname;
			rp->r_unlname = NULL;
			unlcred = rp->r_unlcred;
			rp->r_unlcred = NULL;
			mutex_exit(&rp->r_statelock);

			VN_RELE(unldvp);
			kmem_free(unlname, MAXNAMELEN);
			crfree(unlcred);
		} else {
			mutex_exit(&rp->r_statelock);
		}
		/*
		 * No need to explicitly throw away any cached pages.  The
		 * eventual rinactive() will attempt a synchronous
		 * VOP_PUTPAGE() which will immediately fail since the request
		 * is coming from the wrong zone, and then will proceed to call
		 * nfs_invalidate_pages() which will clean things up for us.
		 */
		rp_addfree(VTOR(vp), cr);
		return;
	}

	if (mi->mi_async_reqs[NFS_INACTIVE] == NULL) {
		mi->mi_async_reqs[NFS_INACTIVE] = args;
	} else {
		mi->mi_async_tail[NFS_INACTIVE]->a_next = args;
	}
	mi->mi_async_tail[NFS_INACTIVE] = args;
	/*
	 * Don't increment r_count, since we're trying to get rid of the vnode.
	 */

	mi->mi_async_req_count++;
	ASSERT(mi->mi_async_req_count != 0);
	cv_signal(&mi->mi_async_reqs_cv);
	mutex_exit(&mi->mi_async_lock);
}

/*
 * The async queues for each mounted file system are arranged as a
 * set of queues, one for each async i/o type.  Requests are taken
 * from the queues in a round-robin fashion.  A number of consecutive
 * requests are taken from each queue before moving on to the next
 * queue.  This functionality may allow the NFS Version 2 server to do
 * write clustering, even if the client is mixing writes and reads
 * because it will take multiple write requests from the queue
 * before processing any of the other async i/o types.
 *
 * XXX The nfs_async_start thread is unsafe in the light of the present
 * model defined by cpr to suspend the system.  Specifically over the
 * wire calls are cpr-unsafe.  The thread should be reevaluated in
 * case of future updates to the cpr model.
 */
static void
nfs_async_start(struct vfs *vfsp)
{
	struct nfs_async_reqs *args;
	mntinfo_t *mi = VFTOMI(vfsp);
	clock_t time_left = 1;
	callb_cpr_t cprinfo;
	int i;

	/*
	 * Dynamic initialization of nfs_async_timeout to allow nfs to be
	 * built in an implementation independent manner.
	 */
	if (nfs_async_timeout == -1)
		nfs_async_timeout = NFS_ASYNC_TIMEOUT;

	CALLB_CPR_INIT(&cprinfo, &mi->mi_async_lock, callb_generic_cpr, "nas");

	mutex_enter(&mi->mi_async_lock);
	for (;;) {
		/*
		 * Find the next queue containing an entry.  We start
		 * at the current queue pointer and then round robin
		 * through all of them until we either find a non-empty
		 * queue or have looked through all of them.
		 */
		for (i = 0; i < NFS_ASYNC_TYPES; i++) {
			args = *mi->mi_async_curr;
			if (args != NULL)
				break;
			mi->mi_async_curr++;
			if (mi->mi_async_curr ==
			    &mi->mi_async_reqs[NFS_ASYNC_TYPES])
				mi->mi_async_curr = &mi->mi_async_reqs[0];
		}
		/*
		 * If we didn't find an entry, then block until woken up
		 * again and then look through the queues again.
		 */
		if (args == NULL) {
			/*
			 * Exiting is considered to be safe for CPR as well
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);

			/*
			 * Wakeup thread waiting to unmount the file
			 * system only if all async threads are inactive.
			 *
			 * If we've timed-out and there's nothing to do,
			 * then get rid of this thread.
			 */
			if (mi->mi_max_threads == 0 || time_left <= 0) {
				if (--mi->mi_threads == 0)
					cv_signal(&mi->mi_async_cv);
				CALLB_CPR_EXIT(&cprinfo);
				VFS_RELE(vfsp);	/* release thread's hold */
				zthread_exit();
				/* NOTREACHED */
			}
			time_left = cv_timedwait(&mi->mi_async_work_cv,
			    &mi->mi_async_lock, nfs_async_timeout + lbolt);

			CALLB_CPR_SAFE_END(&cprinfo, &mi->mi_async_lock);

			continue;
		}
		time_left = 1;

		/*
		 * Remove the request from the async queue and then
		 * update the current async request queue pointer.  If
		 * the current queue is empty or we have removed enough
		 * consecutive entries from it, then reset the counter
		 * for this queue and then move the current pointer to
		 * the next queue.
2007  */
2008 	*mi->mi_async_curr = args->a_next;
2009 	if (*mi->mi_async_curr == NULL ||
2010 	    --mi->mi_async_clusters[args->a_io] == 0) {
2011 		mi->mi_async_clusters[args->a_io] =
2012 		    mi->mi_async_init_clusters;
2013 		mi->mi_async_curr++;
2014 		if (mi->mi_async_curr ==
2015 		    &mi->mi_async_reqs[NFS_ASYNC_TYPES])
2016 			mi->mi_async_curr = &mi->mi_async_reqs[0];
2017 	}
2018 
2019 	if (args->a_io != NFS_INACTIVE && mi->mi_io_kstats) {
2020 		mutex_enter(&mi->mi_lock);
2021 		kstat_waitq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
2022 		mutex_exit(&mi->mi_lock);
2023 	}
2024 
2025 	mutex_exit(&mi->mi_async_lock);
2026 
2027 	/*
2028 	 * Obtain arguments from the async request structure.
2029 	 */
2030 	if (args->a_io == NFS_READ_AHEAD && mi->mi_max_threads > 0) {
2031 		(*args->a_nfs_readahead)(args->a_vp, args->a_nfs_blkoff,
2032 				args->a_nfs_addr, args->a_nfs_seg,
2033 				args->a_cred);
2034 	} else if (args->a_io == NFS_PUTAPAGE) {
2035 		(void) (*args->a_nfs_putapage)(args->a_vp,
2036 				args->a_nfs_pp, args->a_nfs_off,
2037 				args->a_nfs_len, args->a_nfs_flags,
2038 				args->a_cred);
2039 	} else if (args->a_io == NFS_PAGEIO) {
2040 		(void) (*args->a_nfs_pageio)(args->a_vp,
2041 				args->a_nfs_pp, args->a_nfs_off,
2042 				args->a_nfs_len, args->a_nfs_flags,
2043 				args->a_cred);
2044 	} else if (args->a_io == NFS_READDIR) {
2045 		(void) ((*args->a_nfs_readdir)(args->a_vp,
2046 				args->a_nfs_rdc, args->a_cred));
2047 	} else if (args->a_io == NFS_COMMIT) {
2048 		(*args->a_nfs_commit)(args->a_vp, args->a_nfs_plist,
2049 				args->a_nfs_offset, args->a_nfs_count,
2050 				args->a_cred);
2051 	} else if (args->a_io == NFS_INACTIVE) {
2052 		(*args->a_nfs_inactive)(args->a_vp, args->a_cred);
2053 	}
2054 
2055 	/*
2056 	 * Now, release the vnode and free the credentials
2057 	 * structure.
2058 	 */
2059 	free_async_args(args);
2060 	/*
2061 	 * Reacquire the mutex because it will be needed at the top of the loop.
2062 	 */
2063 	mutex_enter(&mi->mi_async_lock);
2064 	}
2065 }
2066 
2067 void
2068 nfs_async_stop(struct vfs *vfsp)
2069 {
2070 	mntinfo_t *mi = VFTOMI(vfsp);
2071 
2072 	/*
2073 	 * Wait for all outstanding async operations to complete and for the
2074 	 * worker threads to exit.
2075 	 */
2076 	mutex_enter(&mi->mi_async_lock);
2077 	mi->mi_max_threads = 0;
2078 	cv_broadcast(&mi->mi_async_work_cv);
2079 	while (mi->mi_threads != 0)
2080 		cv_wait(&mi->mi_async_cv, &mi->mi_async_lock);
2081 	mutex_exit(&mi->mi_async_lock);
2082 }
2083 
2084 /*
2085  * nfs_async_stop_sig:
2086  *	Wait for all outstanding putpage operations to complete. If a signal
2087  *	is delivered, we will abort and return non-zero. If we can put all the
2088  *	pages we will return 0. This routine is called from nfs_unmount and
2089  *	nfs3_unmount to make these operations interruptible.
2090  */
2091 int
2092 nfs_async_stop_sig(struct vfs *vfsp)
2093 {
2094 	mntinfo_t *mi = VFTOMI(vfsp);
2095 	ushort_t omax;
2096 	int rval;
2097 
2098 	/*
2099 	 * Wait for all outstanding async operations to complete and for the
2100 	 * worker threads to exit.
2101 	 */
2102 	mutex_enter(&mi->mi_async_lock);
2103 	omax = mi->mi_max_threads;
2104 	mi->mi_max_threads = 0;
2105 	/*
2106 	 * Tell all the worker threads to exit.
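 *
 * Editorial sketch (not in the original source): this routine and
 * nfs_async_stop() above share the same shutdown idiom:
 *
 *	mi->mi_max_threads = 0;			(no new workers, no new work)
 *	cv_broadcast(&mi->mi_async_work_cv);	(wake idle workers)
 *	while (mi->mi_threads != 0)
 *		wait on mi->mi_async_cv;	(last worker exiting signals it)
 *
 * The difference here is that the wait is cv_wait_sig(), so a pending
 * signal breaks the loop and mi_max_threads is restored from omax so
 * the mount keeps working if the unmount is abandoned.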
2107 	 */
2108 	cv_broadcast(&mi->mi_async_work_cv);
2109 	while (mi->mi_threads != 0) {
2110 		if (!cv_wait_sig(&mi->mi_async_cv, &mi->mi_async_lock))
2111 			break;
2112 	}
2113 	rval = (mi->mi_threads != 0);	/* Interrupted */
2114 	if (rval)
2115 		mi->mi_max_threads = omax;
2116 	mutex_exit(&mi->mi_async_lock);
2117 
2118 	return (rval);
2119 }
2120 
2121 int
2122 writerp(rnode_t *rp, caddr_t base, int tcount, struct uio *uio, int pgcreated)
2123 {
2124 	int pagecreate;
2125 	int n;
2126 	int saved_n;
2127 	caddr_t saved_base;
2128 	u_offset_t offset;
2129 	int error;
2130 	int sm_error;
2131 
2132 	ASSERT(tcount <= MAXBSIZE && tcount <= uio->uio_resid);
2133 	ASSERT(((uintptr_t)base & MAXBOFFSET) + tcount <= MAXBSIZE);
2134 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_WRITER));
2135 
2136 	/*
2137 	 * Move bytes in at most PAGESIZE chunks. We must avoid
2138 	 * spanning pages in uiomove() because page faults may cause
2139 	 * the cache to be invalidated out from under us. The r_size is not
2140 	 * updated until after the uiomove. If we push the last page of a
2141 	 * file before r_size is correct, we will lose the data written past
2142 	 * the current (and invalid) r_size.
2143 	 */
2144 	do {
2145 		offset = uio->uio_loffset;
2146 		pagecreate = 0;
2147 
2148 		/*
2149 		 * n is the number of bytes required to satisfy the request
2150 		 * or the number of bytes to fill out the page.
2151 		 */
2152 		n = (int)MIN((PAGESIZE - ((uintptr_t)base & PAGEOFFSET)),
2153 		    tcount);
2154 
2155 		/*
2156 		 * Check to see if we can skip reading in the page
2157 		 * and just allocate the memory. We can do this
2158 		 * if we are going to rewrite the entire mapping
2159 		 * or if we are going to write to or beyond the current
2160 		 * end of file from the beginning of the mapping.
2161 		 *
2162 		 * The read of r_size is now protected by r_statelock.
2163 		 */
2164 		mutex_enter(&rp->r_statelock);
2165 		/*
2166 		 * When pgcreated is nonzero the caller has already done
2167 		 * a segmap_getmapflt with forcefault 0 and S_WRITE. With
2168 		 * segkpm this means we already have at least one page
2169 		 * created and mapped at base.
2170 		 */
2171 		pagecreate = pgcreated ||
2172 		    (((uintptr_t)base & PAGEOFFSET) == 0 &&
2173 		    (n == PAGESIZE || ((offset + n) >= rp->r_size)));
2174 
2175 		mutex_exit(&rp->r_statelock);
2176 		if (pagecreate) {
2177 			/*
2178 			 * The last argument tells segmap_pagecreate() to
2179 			 * always lock the page, as opposed to sometimes
2180 			 * returning with the page locked. This way we avoid a
2181 			 * fault on the ensuing uiomove(), but also
2182 			 * more importantly (to fix bug 1094402) we can
2183 			 * call segmap_fault() to unlock the page in all
2184 			 * cases. An alternative would be to modify
2185 			 * segmap_pagecreate() to tell us when it is
2186 			 * locking a page, but that's a fairly major
2187 			 * interface change.
2188 			 */
2189 			if (pgcreated == 0)
2190 				(void) segmap_pagecreate(segkmap, base,
2191 				    (uint_t)n, 1);
2192 			saved_base = base;
2193 			saved_n = n;
2194 		}
2195 
2196 		/*
2197 		 * The number of bytes of data in the last page cannot be
2198 		 * accurately determined while the page is being uiomove'd to
2199 		 * and the size of the file is being updated.
2200 		 * Thus, inform threads which need to know accurately
2201 		 * how much data is in the last page of the file. They
2202 		 * will not do the i/o immediately, but will arrange for
2203 		 * the i/o to happen later when this modify operation
2204 		 * will have finished.
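 *
 * Editorial sketch (not in the original source): concretely, a thread
 * about to push this file's pages is expected to check, under
 * r_statelock, something along the lines of
 *
 *	if ((rp->r_flags & RMODINPROGRESS) &&
 *	    <the block it wants to write overlaps rp->r_modaddr>)
 *		defer the i/o (e.g. queue it asynchronously) instead of
 *		writing the page while its contents and r_size are still
 *		in flux;
 *
 * The exact check lives in the putpage paths, which are outside this
 * excerpt.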
2205 */ 2206 ASSERT(!(rp->r_flags & RMODINPROGRESS)); 2207 mutex_enter(&rp->r_statelock); 2208 rp->r_flags |= RMODINPROGRESS; 2209 rp->r_modaddr = (offset & MAXBMASK); 2210 mutex_exit(&rp->r_statelock); 2211 2212 error = uiomove(base, n, UIO_WRITE, uio); 2213 2214 /* 2215 * r_size is the maximum number of 2216 * bytes known to be in the file. 2217 * Make sure it is at least as high as the 2218 * first unwritten byte pointed to by uio_loffset. 2219 */ 2220 mutex_enter(&rp->r_statelock); 2221 if (rp->r_size < uio->uio_loffset) 2222 rp->r_size = uio->uio_loffset; 2223 rp->r_flags &= ~RMODINPROGRESS; 2224 rp->r_flags |= RDIRTY; 2225 mutex_exit(&rp->r_statelock); 2226 2227 /* n = # of bytes written */ 2228 n = (int)(uio->uio_loffset - offset); 2229 base += n; 2230 tcount -= n; 2231 /* 2232 * If we created pages w/o initializing them completely, 2233 * we need to zero the part that wasn't set up. 2234 * This happens on a most EOF write cases and if 2235 * we had some sort of error during the uiomove. 2236 */ 2237 if (pagecreate) { 2238 if ((uio->uio_loffset & PAGEOFFSET) || n == 0) 2239 (void) kzero(base, PAGESIZE - n); 2240 2241 if (pgcreated) { 2242 /* 2243 * Caller is responsible for this page, 2244 * it was not created in this loop. 2245 */ 2246 pgcreated = 0; 2247 } else { 2248 /* 2249 * For bug 1094402: segmap_pagecreate locks 2250 * page. Unlock it. This also unlocks the 2251 * pages allocated by page_create_va() in 2252 * segmap_pagecreate(). 2253 */ 2254 sm_error = segmap_fault(kas.a_hat, segkmap, 2255 saved_base, saved_n, 2256 F_SOFTUNLOCK, S_WRITE); 2257 if (error == 0) 2258 error = sm_error; 2259 } 2260 } 2261 } while (tcount > 0 && error == 0); 2262 2263 return (error); 2264 } 2265 2266 int 2267 nfs_putpages(vnode_t *vp, u_offset_t off, size_t len, int flags, cred_t *cr) 2268 { 2269 rnode_t *rp; 2270 page_t *pp; 2271 u_offset_t eoff; 2272 u_offset_t io_off; 2273 size_t io_len; 2274 int error; 2275 int rdirty; 2276 int err; 2277 2278 rp = VTOR(vp); 2279 ASSERT(rp->r_count > 0); 2280 2281 if (!vn_has_cached_data(vp)) 2282 return (0); 2283 2284 ASSERT(vp->v_type != VCHR); 2285 2286 /* 2287 * If ROUTOFSPACE is set, then all writes turn into B_INVAL 2288 * writes. B_FORCE is set to force the VM system to actually 2289 * invalidate the pages, even if the i/o failed. The pages 2290 * need to get invalidated because they can't be written out 2291 * because there isn't any space left on either the server's 2292 * file system or in the user's disk quota. The B_FREE bit 2293 * is cleared to avoid confusion as to whether this is a 2294 * request to place the page on the freelist or to destroy 2295 * it. 2296 */ 2297 if ((rp->r_flags & ROUTOFSPACE) || 2298 (vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) 2299 flags = (flags & ~B_FREE) | B_INVAL | B_FORCE; 2300 2301 if (len == 0) { 2302 /* 2303 * If doing a full file synchronous operation, then clear 2304 * the RDIRTY bit. If a page gets dirtied while the flush 2305 * is happening, then RDIRTY will get set again. The 2306 * RDIRTY bit must get cleared before the flush so that 2307 * we don't lose this information. 2308 */ 2309 if (off == (u_offset_t)0 && 2310 !(flags & B_ASYNC) && 2311 (rp->r_flags & RDIRTY)) { 2312 mutex_enter(&rp->r_statelock); 2313 rdirty = (rp->r_flags & RDIRTY); 2314 rp->r_flags &= ~RDIRTY; 2315 mutex_exit(&rp->r_statelock); 2316 } else 2317 rdirty = 0; 2318 2319 /* 2320 * Search the entire vp list for pages >= off, and flush 2321 * the dirty pages. 
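 *
 * Editorial sketch (not in the original source): the two call shapes
 * this routine handles are, roughly,
 *
 *	nfs_putpages(vp, off, 0, flags, cr);	(every page at or past off)
 *	nfs_putpages(vp, off, len, flags, cr);	(just the range [off, off+len))
 *
 * The len == 0 case walks every such page on the vnode through
 * pvn_vplist_dirty() below; the ranged case further down walks
 * page_lookup()/page_lookup_nowait() from off up to the smaller of
 * off + len and r_size.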
2322 */ 2323 error = pvn_vplist_dirty(vp, off, rp->r_putapage, 2324 flags, cr); 2325 2326 /* 2327 * If an error occured and the file was marked as dirty 2328 * before and we aren't forcibly invalidating pages, then 2329 * reset the RDIRTY flag. 2330 */ 2331 if (error && rdirty && 2332 (flags & (B_INVAL | B_FORCE)) != (B_INVAL | B_FORCE)) { 2333 mutex_enter(&rp->r_statelock); 2334 rp->r_flags |= RDIRTY; 2335 mutex_exit(&rp->r_statelock); 2336 } 2337 } else { 2338 /* 2339 * Do a range from [off...off + len) looking for pages 2340 * to deal with. 2341 */ 2342 error = 0; 2343 #ifdef lint 2344 io_len = 0; 2345 #endif 2346 eoff = off + len; 2347 mutex_enter(&rp->r_statelock); 2348 for (io_off = off; io_off < eoff && io_off < rp->r_size; 2349 io_off += io_len) { 2350 mutex_exit(&rp->r_statelock); 2351 /* 2352 * If we are not invalidating, synchronously 2353 * freeing or writing pages use the routine 2354 * page_lookup_nowait() to prevent reclaiming 2355 * them from the free list. 2356 */ 2357 if ((flags & B_INVAL) || !(flags & B_ASYNC)) { 2358 pp = page_lookup(vp, io_off, 2359 (flags & (B_INVAL | B_FREE)) ? 2360 SE_EXCL : SE_SHARED); 2361 } else { 2362 pp = page_lookup_nowait(vp, io_off, 2363 (flags & B_FREE) ? SE_EXCL : SE_SHARED); 2364 } 2365 2366 if (pp == NULL || !pvn_getdirty(pp, flags)) 2367 io_len = PAGESIZE; 2368 else { 2369 err = (*rp->r_putapage)(vp, pp, &io_off, 2370 &io_len, flags, cr); 2371 if (!error) 2372 error = err; 2373 /* 2374 * "io_off" and "io_len" are returned as 2375 * the range of pages we actually wrote. 2376 * This allows us to skip ahead more quickly 2377 * since several pages may've been dealt 2378 * with by this iteration of the loop. 2379 */ 2380 } 2381 mutex_enter(&rp->r_statelock); 2382 } 2383 mutex_exit(&rp->r_statelock); 2384 } 2385 2386 return (error); 2387 } 2388 2389 void 2390 nfs_invalidate_pages(vnode_t *vp, u_offset_t off, cred_t *cr) 2391 { 2392 rnode_t *rp; 2393 2394 rp = VTOR(vp); 2395 mutex_enter(&rp->r_statelock); 2396 while (rp->r_flags & RTRUNCATE) 2397 cv_wait(&rp->r_cv, &rp->r_statelock); 2398 rp->r_flags |= RTRUNCATE; 2399 if (off == (u_offset_t)0) { 2400 rp->r_flags &= ~RDIRTY; 2401 if (!(rp->r_flags & RSTALE)) 2402 rp->r_error = 0; 2403 } 2404 rp->r_truncaddr = off; 2405 mutex_exit(&rp->r_statelock); 2406 (void) pvn_vplist_dirty(vp, off, rp->r_putapage, 2407 B_INVAL | B_TRUNC, cr); 2408 mutex_enter(&rp->r_statelock); 2409 rp->r_flags &= ~RTRUNCATE; 2410 cv_broadcast(&rp->r_cv); 2411 mutex_exit(&rp->r_statelock); 2412 } 2413 2414 static int nfs_write_error_to_cons_only = 0; 2415 #define MSG(x) (nfs_write_error_to_cons_only ? 
(x) : (x) + 1) 2416 2417 /* 2418 * Print a file handle 2419 */ 2420 void 2421 nfs_printfhandle(nfs_fhandle *fhp) 2422 { 2423 int *ip; 2424 char *buf; 2425 size_t bufsize; 2426 char *cp; 2427 2428 /* 2429 * 13 == "(file handle:" 2430 * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times 2431 * 1 == ' ' 2432 * 8 == maximum strlen of "%x" 2433 * 3 == ")\n\0" 2434 */ 2435 bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3; 2436 buf = kmem_alloc(bufsize, KM_NOSLEEP); 2437 if (buf == NULL) 2438 return; 2439 2440 cp = buf; 2441 (void) strcpy(cp, "(file handle:"); 2442 while (*cp != '\0') 2443 cp++; 2444 for (ip = (int *)fhp->fh_buf; 2445 ip < (int *)&fhp->fh_buf[fhp->fh_len]; 2446 ip++) { 2447 (void) sprintf(cp, " %x", *ip); 2448 while (*cp != '\0') 2449 cp++; 2450 } 2451 (void) strcpy(cp, ")\n"); 2452 2453 zcmn_err(getzoneid(), CE_CONT, MSG("^%s"), buf); 2454 2455 kmem_free(buf, bufsize); 2456 } 2457 2458 /* 2459 * Notify the system administrator that an NFS write error has 2460 * occurred. 2461 */ 2462 2463 /* seconds between ENOSPC/EDQUOT messages */ 2464 clock_t nfs_write_error_interval = 5; 2465 2466 void 2467 nfs_write_error(vnode_t *vp, int error, cred_t *cr) 2468 { 2469 mntinfo_t *mi; 2470 2471 mi = VTOMI(vp); 2472 /* 2473 * In case of forced unmount or zone shutdown, do not print any 2474 * messages since it can flood the console with error messages. 2475 */ 2476 if (FS_OR_ZONE_GONE(mi->mi_vfsp)) 2477 return; 2478 2479 /* 2480 * No use in flooding the console with ENOSPC 2481 * messages from the same file system. 2482 */ 2483 if ((error != ENOSPC && error != EDQUOT) || 2484 lbolt - mi->mi_printftime > 0) { 2485 zoneid_t zoneid = mi->mi_zone->zone_id; 2486 2487 #ifdef DEBUG 2488 nfs_perror(error, "NFS%ld write error on host %s: %m.\n", 2489 mi->mi_vers, VTOR(vp)->r_server->sv_hostname, NULL); 2490 #else 2491 nfs_perror(error, "NFS write error on host %s: %m.\n", 2492 VTOR(vp)->r_server->sv_hostname, NULL); 2493 #endif 2494 if (error == ENOSPC || error == EDQUOT) { 2495 zcmn_err(zoneid, CE_CONT, 2496 MSG("^File: userid=%d, groupid=%d\n"), 2497 crgetuid(cr), crgetgid(cr)); 2498 if (crgetuid(CRED()) != crgetuid(cr) || 2499 crgetgid(CRED()) != crgetgid(cr)) { 2500 zcmn_err(zoneid, CE_CONT, 2501 MSG("^User: userid=%d, groupid=%d\n"), 2502 crgetuid(CRED()), crgetgid(CRED())); 2503 } 2504 mi->mi_printftime = lbolt + 2505 nfs_write_error_interval * hz; 2506 } 2507 nfs_printfhandle(&VTOR(vp)->r_fh); 2508 #ifdef DEBUG 2509 if (error == EACCES) { 2510 zcmn_err(zoneid, CE_CONT, 2511 MSG("^nfs_bio: cred is%s kcred\n"), 2512 cr == kcred ? "" : " not"); 2513 } 2514 #endif 2515 } 2516 } 2517 2518 /* ARGSUSED */ 2519 static void * 2520 nfs_mi_init(zoneid_t zoneid) 2521 { 2522 struct mi_globals *mig; 2523 2524 mig = kmem_alloc(sizeof (*mig), KM_SLEEP); 2525 mutex_init(&mig->mig_lock, NULL, MUTEX_DEFAULT, NULL); 2526 list_create(&mig->mig_list, sizeof (mntinfo_t), 2527 offsetof(mntinfo_t, mi_zone_node)); 2528 mig->mig_destructor_called = B_FALSE; 2529 return (mig); 2530 } 2531 2532 /* 2533 * Callback routine to tell all NFS mounts in the zone to stop creating new 2534 * threads. Existing threads should exit. 
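 *
 * Editorial sketch (not in the original source): this is one of the
 * three per-zone hooks registered by nfs_clntinit() further down:
 *
 *	zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
 *	    nfs_mi_destroy);
 *
 * nfs_mi_init() creates the zone's mount list, this shutdown callback
 * stops the async machinery of every mount on that list, and
 * nfs_mi_destroy() frees the list -- unless mounts are still waiting
 * on VFS_FREEVFS(), in which case nfs_mi_zonelist_remove() performs
 * the final free later.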
2535 */ 2536 /* ARGSUSED */ 2537 static void 2538 nfs_mi_shutdown(zoneid_t zoneid, void *data) 2539 { 2540 struct mi_globals *mig = data; 2541 mntinfo_t *mi; 2542 2543 ASSERT(mig != NULL); 2544 again: 2545 mutex_enter(&mig->mig_lock); 2546 for (mi = list_head(&mig->mig_list); mi != NULL; 2547 mi = list_next(&mig->mig_list, mi)) { 2548 2549 /* 2550 * If we've done the shutdown work for this FS, skip. 2551 * Once we go off the end of the list, we're done. 2552 */ 2553 if (mi->mi_flags & MI_DEAD) 2554 continue; 2555 2556 /* 2557 * We will do work, so not done. Get a hold on the FS. 2558 */ 2559 VFS_HOLD(mi->mi_vfsp); 2560 2561 /* 2562 * purge the DNLC for this filesystem 2563 */ 2564 (void) dnlc_purge_vfsp(mi->mi_vfsp, 0); 2565 2566 mutex_enter(&mi->mi_async_lock); 2567 /* 2568 * Tell existing async worker threads to exit. 2569 */ 2570 mi->mi_max_threads = 0; 2571 cv_broadcast(&mi->mi_async_work_cv); 2572 /* 2573 * Set MI_ASYNC_MGR_STOP so the async manager thread starts 2574 * getting ready to exit when it's done with its current work. 2575 * Also set MI_DEAD to note we've acted on this FS. 2576 */ 2577 mutex_enter(&mi->mi_lock); 2578 mi->mi_flags |= (MI_ASYNC_MGR_STOP|MI_DEAD); 2579 mutex_exit(&mi->mi_lock); 2580 /* 2581 * Wake up the async manager thread. 2582 */ 2583 cv_broadcast(&mi->mi_async_reqs_cv); 2584 mutex_exit(&mi->mi_async_lock); 2585 2586 /* 2587 * Drop lock and release FS, which may change list, then repeat. 2588 * We're done when every mi has been done or the list is empty. 2589 */ 2590 mutex_exit(&mig->mig_lock); 2591 VFS_RELE(mi->mi_vfsp); 2592 goto again; 2593 } 2594 mutex_exit(&mig->mig_lock); 2595 } 2596 2597 static void 2598 nfs_mi_free_globals(struct mi_globals *mig) 2599 { 2600 list_destroy(&mig->mig_list); /* makes sure the list is empty */ 2601 mutex_destroy(&mig->mig_lock); 2602 kmem_free(mig, sizeof (*mig)); 2603 2604 } 2605 2606 /* ARGSUSED */ 2607 static void 2608 nfs_mi_destroy(zoneid_t zoneid, void *data) 2609 { 2610 struct mi_globals *mig = data; 2611 2612 ASSERT(mig != NULL); 2613 mutex_enter(&mig->mig_lock); 2614 if (list_head(&mig->mig_list) != NULL) { 2615 /* Still waiting for VFS_FREEVFS() */ 2616 mig->mig_destructor_called = B_TRUE; 2617 mutex_exit(&mig->mig_lock); 2618 return; 2619 } 2620 nfs_mi_free_globals(mig); 2621 } 2622 2623 /* 2624 * Add an NFS mount to the per-zone list of NFS mounts. 2625 */ 2626 void 2627 nfs_mi_zonelist_add(mntinfo_t *mi) 2628 { 2629 struct mi_globals *mig; 2630 2631 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2632 mutex_enter(&mig->mig_lock); 2633 list_insert_head(&mig->mig_list, mi); 2634 mutex_exit(&mig->mig_lock); 2635 } 2636 2637 /* 2638 * Remove an NFS mount from the per-zone list of NFS mounts. 2639 */ 2640 static void 2641 nfs_mi_zonelist_remove(mntinfo_t *mi) 2642 { 2643 struct mi_globals *mig; 2644 2645 mig = zone_getspecific(mi_list_key, mi->mi_zone); 2646 mutex_enter(&mig->mig_lock); 2647 list_remove(&mig->mig_list, mi); 2648 /* 2649 * We can be called asynchronously by VFS_FREEVFS() after the zone 2650 * shutdown/destroy callbacks have executed; if so, clean up the zone's 2651 * mi globals. 2652 */ 2653 if (list_head(&mig->mig_list) == NULL && 2654 mig->mig_destructor_called == B_TRUE) { 2655 nfs_mi_free_globals(mig); 2656 return; 2657 } 2658 mutex_exit(&mig->mig_lock); 2659 } 2660 2661 /* 2662 * NFS Client initialization routine. This routine should only be called 2663 * once. 
It performs the following tasks:
2664  *	- Initialize all global locks
2665  *	- Call sub-initialization routines (localize access to variables)
2666  */
2667 int
2668 nfs_clntinit(void)
2669 {
2670 #ifdef DEBUG
2671 	static boolean_t nfs_clntup = B_FALSE;
2672 #endif
2673 	int error;
2674 
2675 #ifdef DEBUG
2676 	ASSERT(nfs_clntup == B_FALSE);
2677 #endif
2678 
2679 	error = nfs_subrinit();
2680 	if (error)
2681 		return (error);
2682 
2683 	error = nfs_vfsinit();
2684 	if (error) {
2685 		/*
2686 		 * Clean up nfs_subrinit() work
2687 		 */
2688 		nfs_subrfini();
2689 		return (error);
2690 	}
2691 	zone_key_create(&mi_list_key, nfs_mi_init, nfs_mi_shutdown,
2692 	    nfs_mi_destroy);
2693 
2694 	x_READ3args = xdr_READ3args;
2695 	x_READ3res = xdr_READ3res;
2696 	x_READ3vres = xdr_READ3vres;
2697 	x_READ3uiores = xdr_READ3uiores;
2698 
2699 	nfs4_clnt_init();
2700 
2701 #ifdef DEBUG
2702 	nfs_clntup = B_TRUE;
2703 #endif
2704 
2705 	return (0);
2706 }
2707 
2708 /*
2709  * This routine is only called if the NFS Client has been initialized but
2710  * the module failed to be installed. This routine will clean up the
2711  * previously allocated/initialized work.
2712  */
2713 void
2714 nfs_clntfini(void)
2715 {
2716 	(void) zone_key_delete(mi_list_key);
2717 	nfs_subrfini();
2718 	nfs_vfsfini();
2719 	nfs4_clnt_fini();
2720 }
2721 
2722 /*
2723  * nfs_lockrelease:
2724  *
2725  * Release any locks on the given vnode that are held by the current
2726  * process.
2727  */
2728 void
2729 nfs_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
2730 {
2731 	flock64_t ld;
2732 	struct shrlock shr;
2733 	char *buf;
2734 	int remote_lock_possible;
2735 	int ret;
2736 
2737 	ASSERT((uintptr_t)vp > KERNELBASE);
2738 
2739 	/*
2740 	 * Generate an explicit unlock operation for the entire file. As a
2741 	 * partial optimization, only generate the unlock if there is a
2742 	 * lock registered for the file. We could check whether this
2743 	 * particular process has any locks on the file, but that would
2744 	 * require the local locking code to provide yet another query
2745 	 * routine. Note that no explicit synchronization is needed here.
2746 	 * At worst, flk_has_remote_locks() will return a false positive,
2747 	 * in which case the unlock call wastes time but doesn't harm
2748 	 * correctness.
2749 	 *
2750 	 * In addition, an unlock request is generated if the process
2751 	 * is listed as possibly having a lock on the file because the
2752 	 * server and client lock managers may have gotten out of sync.
2753 	 * N.B. It is important to make sure nfs_remove_locking_id() is
2754 	 * called here even if flk_has_remote_locks(vp) reports true.
2755 	 * If it is not called and there is an entry on the process id
2756 	 * list, that entry will never get removed.
2757 	 */
2758 	remote_lock_possible = nfs_remove_locking_id(vp, RLMPL_PID,
2759 	    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2760 	if (remote_lock_possible || flk_has_remote_locks(vp)) {
2761 		ld.l_type = F_UNLCK;	/* set to unlock entire file */
2762 		ld.l_whence = 0;	/* unlock from start of file */
2763 		ld.l_start = 0;
2764 		ld.l_len = 0;		/* do entire file */
2765 		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr);
2766 
2767 		if (ret != 0) {
2768 			/*
2769 			 * If VOP_FRLOCK fails, make sure we unregister
2770 			 * local locks before we continue.
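 *
 * Editorial sketch (not in the original source): the lock description
 * built above uses the usual fcntl(2) convention for "the whole file":
 *
 *	ld.l_type = F_UNLCK;	(release rather than acquire)
 *	ld.l_whence = 0;	(offsets relative to the start of file)
 *	ld.l_start = 0;
 *	ld.l_len = 0;		(zero length means through end of file)
 *
 * and when the over-the-wire unlock fails, the code below registers
 * that same whole-file unlock locally so the client's own lock state
 * does not keep claiming a lock the server may no longer hold.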
2771 			 */
2772 			ld.l_pid = ttoproc(curthread)->p_pid;
2773 			lm_register_lock_locally(vp, NULL, &ld, flag, offset);
2774 #ifdef DEBUG
2775 			nfs_perror(ret,
2776 			    "NFS lock release error on vp %p: %m.\n",
2777 			    (void *)vp, NULL);
2778 #endif
2779 		}
2780 
2781 		/*
2782 		 * The call to VOP_FRLOCK may put the pid back on the
2783 		 * list. We need to remove it.
2784 		 */
2785 		(void) nfs_remove_locking_id(vp, RLMPL_PID,
2786 		    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL);
2787 	}
2788 
2789 	/*
2790 	 * As long as the vp has a share matching our pid,
2791 	 * pluck it off and unshare it. There are circumstances in
2792 	 * which the call to nfs_remove_locking_id() may put the
2793 	 * owner back on the list, in which case we simply do a
2794 	 * redundant and harmless unshare.
2795 	 */
2796 	buf = kmem_alloc(MAX_SHR_OWNER_LEN, KM_SLEEP);
2797 	while (nfs_remove_locking_id(vp, RLMPL_OWNER,
2798 	    (char *)NULL, buf, &shr.s_own_len)) {
2799 		shr.s_owner = buf;
2800 		shr.s_access = 0;
2801 		shr.s_deny = 0;
2802 		shr.s_sysid = 0;
2803 		shr.s_pid = curproc->p_pid;
2804 
2805 		ret = VOP_SHRLOCK(vp, F_UNSHARE, &shr, flag, cr);
2806 #ifdef DEBUG
2807 		if (ret != 0) {
2808 			nfs_perror(ret,
2809 			    "NFS share release error on vp %p: %m.\n",
2810 			    (void *)vp, NULL);
2811 		}
2812 #endif
2813 	}
2814 	kmem_free(buf, MAX_SHR_OWNER_LEN);
2815 }
2816 
2817 /*
2818  * nfs_lockcompletion:
2819  *
2820  * If the vnode has a lock that makes it unsafe to cache the file, mark it
2821  * as non-cacheable (set the VNOCACHE bit).
2822  */
2823 
2824 void
2825 nfs_lockcompletion(vnode_t *vp, int cmd)
2826 {
2827 #ifdef DEBUG
2828 	rnode_t *rp = VTOR(vp);
2829 
2830 	ASSERT(nfs_rw_lock_held(&rp->r_lkserlock, RW_WRITER));
2831 #endif
2832 
2833 	if (cmd == F_SETLK || cmd == F_SETLKW) {
2834 		if (!lm_safemap(vp)) {
2835 			mutex_enter(&vp->v_lock);
2836 			vp->v_flag |= VNOCACHE;
2837 			mutex_exit(&vp->v_lock);
2838 		} else {
2839 			mutex_enter(&vp->v_lock);
2840 			vp->v_flag &= ~VNOCACHE;
2841 			mutex_exit(&vp->v_lock);
2842 		}
2843 	}
2844 	/*
2845 	 * The cached attributes of the file are stale after acquiring
2846 	 * the lock on the file. They were updated when the file was
2847 	 * opened, but not updated when the lock was acquired. Therefore the
2848 	 * cached attributes are invalidated after the lock is obtained.
2849 	 */
2850 	PURGE_ATTRCACHE(vp);
2851 }
2852 
2853 /*
2854  * The lock manager holds state making it possible for the client
2855  * and server to be out of sync. For example, if the response from
2856  * the server granting a lock request is lost, the server will think
2857  * the lock is granted and the client will think the lock is lost.
2858  * The client can tell when it is not certain whether it is in sync
2859  * with the server.
2860  *
2861  * To deal with this, a list of processes for which the client is
2862  * not sure if the server holds a lock is attached to the rnode.
2863  * When such a process closes the rnode, an unlock request is sent
2864  * to the server to unlock the entire file.
2865  *
2866  * The list is kept as a singly linked NULL-terminated list.
2867  * Because it is only added to under extreme error conditions, the
2868  * list shouldn't get very big. DEBUG kernels print a message if
2869  * the list gets bigger than nfs_lmpl_high_water. This is arbitrarily
2870  * chosen to be 8, but can be tuned at runtime.
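 *
 * Editorial sketch (not in the original source): each entry on the
 * per-rnode r_lmpl list is an lmpl_t holding, in outline,
 *
 *	lmpl_type	RLMPL_PID or RLMPL_OWNER
 *	lmpl_pid	process id that may still hold the lock or share
 *	lmpl_owner	opaque owner identifier (a pid_t for RLMPL_PID)
 *	lmpl_own_len	length of lmpl_owner in bytes
 *	lmpl_next	next entry (NULL terminates the list)
 *
 * Entries are appended by nfs_add_locking_id() below when the client
 * loses confidence in the server's state, and are drained by
 * nfs_lockrelease() above, which sends a whole-file unlock or unshare
 * for each entry it removes.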
2871 */ 2872 #ifdef DEBUG 2873 /* int nfs_lmpl_high_water = 8; */ 2874 int nfs_lmpl_high_water = 128; 2875 int nfs_cnt_add_locking_id = 0; 2876 int nfs_len_add_locking_id = 0; 2877 #endif /* DEBUG */ 2878 2879 /* 2880 * Record that the nfs lock manager server may be holding a lock on 2881 * a vnode for a process. 2882 * 2883 * Because the nfs lock manager server holds state, it is possible 2884 * for the server to get out of sync with the client. This routine is called 2885 * from the client when it is no longer sure if the server is in sync 2886 * with the client. nfs_lockrelease() will then notice this and send 2887 * an unlock request when the file is closed 2888 */ 2889 void 2890 nfs_add_locking_id(vnode_t *vp, pid_t pid, int type, char *id, int len) 2891 { 2892 rnode_t *rp; 2893 lmpl_t *new; 2894 lmpl_t *cur; 2895 lmpl_t **lmplp; 2896 #ifdef DEBUG 2897 int list_len = 1; 2898 #endif /* DEBUG */ 2899 2900 #ifdef DEBUG 2901 ++nfs_cnt_add_locking_id; 2902 #endif /* DEBUG */ 2903 /* 2904 * allocate new lmpl_t now so we don't sleep 2905 * later after grabbing mutexes 2906 */ 2907 ASSERT(len < MAX_SHR_OWNER_LEN); 2908 new = kmem_alloc(sizeof (*new), KM_SLEEP); 2909 new->lmpl_type = type; 2910 new->lmpl_pid = pid; 2911 new->lmpl_owner = kmem_alloc(len, KM_SLEEP); 2912 bcopy(id, new->lmpl_owner, len); 2913 new->lmpl_own_len = len; 2914 new->lmpl_next = (lmpl_t *)NULL; 2915 #ifdef DEBUG 2916 if (type == RLMPL_PID) { 2917 ASSERT(len == sizeof (pid_t)); 2918 ASSERT(pid == *(pid_t *)new->lmpl_owner); 2919 } else { 2920 ASSERT(type == RLMPL_OWNER); 2921 } 2922 #endif 2923 2924 rp = VTOR(vp); 2925 mutex_enter(&rp->r_statelock); 2926 2927 /* 2928 * Add this id to the list for this rnode only if the 2929 * rnode is active and the id is not already there. 2930 */ 2931 ASSERT(rp->r_flags & RHASHED); 2932 lmplp = &(rp->r_lmpl); 2933 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 2934 if (cur->lmpl_pid == pid && 2935 cur->lmpl_type == type && 2936 cur->lmpl_own_len == len && 2937 bcmp(cur->lmpl_owner, new->lmpl_owner, len) == 0) { 2938 kmem_free(new->lmpl_owner, len); 2939 kmem_free(new, sizeof (*new)); 2940 break; 2941 } 2942 lmplp = &cur->lmpl_next; 2943 #ifdef DEBUG 2944 ++list_len; 2945 #endif /* DEBUG */ 2946 } 2947 if (cur == (lmpl_t *)NULL) { 2948 *lmplp = new; 2949 #ifdef DEBUG 2950 if (list_len > nfs_len_add_locking_id) { 2951 nfs_len_add_locking_id = list_len; 2952 } 2953 if (list_len > nfs_lmpl_high_water) { 2954 cmn_err(CE_WARN, "nfs_add_locking_id: long list " 2955 "vp=%p is %d", (void *)vp, list_len); 2956 } 2957 #endif /* DEBUG */ 2958 } 2959 2960 #ifdef DEBUG 2961 if (share_debug) { 2962 int nitems = 0; 2963 int npids = 0; 2964 int nowners = 0; 2965 2966 /* 2967 * Count the number of things left on r_lmpl after the remove. 2968 */ 2969 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 2970 cur = cur->lmpl_next) { 2971 nitems++; 2972 if (cur->lmpl_type == RLMPL_PID) { 2973 npids++; 2974 } else if (cur->lmpl_type == RLMPL_OWNER) { 2975 nowners++; 2976 } else { 2977 cmn_err(CE_PANIC, "nfs_add_locking_id: " 2978 "unrecognised lmpl_type %d", 2979 cur->lmpl_type); 2980 } 2981 } 2982 2983 cmn_err(CE_CONT, "nfs_add_locking_id(%s): %d PIDs + %d " 2984 "OWNs = %d items left on r_lmpl\n", 2985 (type == RLMPL_PID) ? "P" : "O", npids, nowners, nitems); 2986 } 2987 #endif 2988 2989 mutex_exit(&rp->r_statelock); 2990 } 2991 2992 /* 2993 * Remove an id from the lock manager id list. 2994 * 2995 * If the id is not in the list return 0. If it was found and 2996 * removed, return 1. 
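 *
 * Editorial sketch (not in the original source): typical usage, as in
 * nfs_lockrelease() above, is
 *
 *	(pid entry -- was this process on the list at all?)
 *	if (nfs_remove_locking_id(vp, RLMPL_PID,
 *	    (char *)&(ttoproc(curthread)->p_pid), NULL, NULL))
 *		send a whole-file unlock;
 *
 *	(owner entries -- pop shares one at a time, copying each owner out)
 *	while (nfs_remove_locking_id(vp, RLMPL_OWNER,
 *	    (char *)NULL, buf, &shr.s_own_len))
 *		unshare using the owner id returned in buf;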
2997 */ 2998 static int 2999 nfs_remove_locking_id(vnode_t *vp, int type, char *id, char *rid, int *rlen) 3000 { 3001 lmpl_t *cur; 3002 lmpl_t **lmplp; 3003 rnode_t *rp; 3004 int rv = 0; 3005 3006 ASSERT(type == RLMPL_PID || type == RLMPL_OWNER); 3007 3008 rp = VTOR(vp); 3009 3010 mutex_enter(&rp->r_statelock); 3011 ASSERT(rp->r_flags & RHASHED); 3012 lmplp = &(rp->r_lmpl); 3013 3014 /* 3015 * Search through the list and remove the entry for this id 3016 * if it is there. The special case id == NULL allows removal 3017 * of the first share on the r_lmpl list belonging to the 3018 * current process (if any), without regard to further details 3019 * of its identity. 3020 */ 3021 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; cur = cur->lmpl_next) { 3022 if (cur->lmpl_type == type && 3023 cur->lmpl_pid == curproc->p_pid && 3024 (id == (char *)NULL || 3025 bcmp(cur->lmpl_owner, id, cur->lmpl_own_len) == 0)) { 3026 *lmplp = cur->lmpl_next; 3027 ASSERT(cur->lmpl_own_len < MAX_SHR_OWNER_LEN); 3028 if (rid != NULL) { 3029 bcopy(cur->lmpl_owner, rid, cur->lmpl_own_len); 3030 *rlen = cur->lmpl_own_len; 3031 } 3032 kmem_free(cur->lmpl_owner, cur->lmpl_own_len); 3033 kmem_free(cur, sizeof (*cur)); 3034 rv = 1; 3035 break; 3036 } 3037 lmplp = &cur->lmpl_next; 3038 } 3039 3040 #ifdef DEBUG 3041 if (share_debug) { 3042 int nitems = 0; 3043 int npids = 0; 3044 int nowners = 0; 3045 3046 /* 3047 * Count the number of things left on r_lmpl after the remove. 3048 */ 3049 for (cur = rp->r_lmpl; cur != (lmpl_t *)NULL; 3050 cur = cur->lmpl_next) { 3051 nitems++; 3052 if (cur->lmpl_type == RLMPL_PID) { 3053 npids++; 3054 } else if (cur->lmpl_type == RLMPL_OWNER) { 3055 nowners++; 3056 } else { 3057 cmn_err(CE_PANIC, 3058 "nrli: unrecognised lmpl_type %d", 3059 cur->lmpl_type); 3060 } 3061 } 3062 3063 cmn_err(CE_CONT, 3064 "nrli(%s): %d PIDs + %d OWNs = %d items left on r_lmpl\n", 3065 (type == RLMPL_PID) ? "P" : "O", 3066 npids, 3067 nowners, 3068 nitems); 3069 } 3070 #endif 3071 3072 mutex_exit(&rp->r_statelock); 3073 return (rv); 3074 } 3075 3076 void 3077 nfs_free_mi(mntinfo_t *mi) 3078 { 3079 ASSERT(mi->mi_flags & MI_ASYNC_MGR_STOP); 3080 ASSERT(mi->mi_manager_thread == NULL); 3081 ASSERT(mi->mi_threads == 0); 3082 3083 /* 3084 * Remove the node from the global list before we start tearing it down. 3085 */ 3086 nfs_mi_zonelist_remove(mi); 3087 if (mi->mi_klmconfig) { 3088 lm_free_config(mi->mi_klmconfig); 3089 kmem_free(mi->mi_klmconfig, sizeof (struct knetconfig)); 3090 } 3091 mutex_destroy(&mi->mi_lock); 3092 mutex_destroy(&mi->mi_remap_lock); 3093 mutex_destroy(&mi->mi_async_lock); 3094 cv_destroy(&mi->mi_failover_cv); 3095 cv_destroy(&mi->mi_async_work_cv); 3096 cv_destroy(&mi->mi_async_reqs_cv); 3097 cv_destroy(&mi->mi_async_cv); 3098 zone_rele(mi->mi_zone); 3099 kmem_free(mi, sizeof (*mi)); 3100 } 3101 3102 static int 3103 mnt_kstat_update(kstat_t *ksp, int rw) 3104 { 3105 mntinfo_t *mi; 3106 struct mntinfo_kstat *mik; 3107 vfs_t *vfsp; 3108 int i; 3109 3110 /* this is a read-only kstat. Bail out on a write */ 3111 if (rw == KSTAT_WRITE) 3112 return (EACCES); 3113 3114 /* 3115 * We don't want to wait here as kstat_chain_lock could be held by 3116 * dounmount(). dounmount() takes vfs_reflock before the chain lock 3117 * and thus could lead to a deadlock. 
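 *
 * Editorial sketch (not in the original source): this update hook runs
 * whenever userland reads the "mntinfo" kstat, e.g. the way nfsstat -m
 * does via libkstat, roughly:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "nfs", instance, "mntinfo");
 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
 *		struct mntinfo_kstat *mik = ksp->ks_data;
 *		... mik->mik_proto, mik->mik_curserver, ...
 *	}
 *
 * where instance is the minor number of the mount's vfs_dev (see
 * nfs_mnt_kstat_init() below). Since such reads can race with unmount,
 * we must not block here.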
3118 */ 3119 vfsp = (struct vfs *)ksp->ks_private; 3120 3121 3122 mi = VFTOMI(vfsp); 3123 3124 mik = (struct mntinfo_kstat *)ksp->ks_data; 3125 3126 (void) strcpy(mik->mik_proto, mi->mi_curr_serv->sv_knconf->knc_proto); 3127 mik->mik_vers = (uint32_t)mi->mi_vers; 3128 mik->mik_flags = mi->mi_flags; 3129 mik->mik_secmod = mi->mi_curr_serv->sv_secdata->secmod; 3130 mik->mik_curread = (uint32_t)mi->mi_curread; 3131 mik->mik_curwrite = (uint32_t)mi->mi_curwrite; 3132 mik->mik_retrans = mi->mi_retrans; 3133 mik->mik_timeo = mi->mi_timeo; 3134 mik->mik_acregmin = HR2SEC(mi->mi_acregmin); 3135 mik->mik_acregmax = HR2SEC(mi->mi_acregmax); 3136 mik->mik_acdirmin = HR2SEC(mi->mi_acdirmin); 3137 mik->mik_acdirmax = HR2SEC(mi->mi_acdirmax); 3138 for (i = 0; i < NFS_CALLTYPES + 1; i++) { 3139 mik->mik_timers[i].srtt = (uint32_t)mi->mi_timers[i].rt_srtt; 3140 mik->mik_timers[i].deviate = 3141 (uint32_t)mi->mi_timers[i].rt_deviate; 3142 mik->mik_timers[i].rtxcur = 3143 (uint32_t)mi->mi_timers[i].rt_rtxcur; 3144 } 3145 mik->mik_noresponse = (uint32_t)mi->mi_noresponse; 3146 mik->mik_failover = (uint32_t)mi->mi_failover; 3147 mik->mik_remap = (uint32_t)mi->mi_remap; 3148 (void) strcpy(mik->mik_curserver, mi->mi_curr_serv->sv_hostname); 3149 3150 return (0); 3151 } 3152 3153 void 3154 nfs_mnt_kstat_init(struct vfs *vfsp) 3155 { 3156 mntinfo_t *mi = VFTOMI(vfsp); 3157 3158 /* 3159 * Create the version specific kstats. 3160 * 3161 * PSARC 2001/697 Contract Private Interface 3162 * All nfs kstats are under SunMC contract 3163 * Please refer to the PSARC listed above and contact 3164 * SunMC before making any changes! 3165 * 3166 * Changes must be reviewed by Solaris File Sharing 3167 * Changes must be communicated to contract-2001-697@sun.com 3168 * 3169 */ 3170 3171 mi->mi_io_kstats = kstat_create_zone("nfs", getminor(vfsp->vfs_dev), 3172 NULL, "nfs", KSTAT_TYPE_IO, 1, 0, mi->mi_zone->zone_id); 3173 if (mi->mi_io_kstats) { 3174 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3175 kstat_zone_add(mi->mi_io_kstats, GLOBAL_ZONEID); 3176 mi->mi_io_kstats->ks_lock = &mi->mi_lock; 3177 kstat_install(mi->mi_io_kstats); 3178 } 3179 3180 if ((mi->mi_ro_kstats = kstat_create_zone("nfs", 3181 getminor(vfsp->vfs_dev), "mntinfo", "misc", KSTAT_TYPE_RAW, 3182 sizeof (struct mntinfo_kstat), 0, mi->mi_zone->zone_id)) != NULL) { 3183 if (mi->mi_zone->zone_id != GLOBAL_ZONEID) 3184 kstat_zone_add(mi->mi_ro_kstats, GLOBAL_ZONEID); 3185 mi->mi_ro_kstats->ks_update = mnt_kstat_update; 3186 mi->mi_ro_kstats->ks_private = (void *)vfsp; 3187 kstat_install(mi->mi_ro_kstats); 3188 } 3189 } 3190 3191 nfs_delmapcall_t * 3192 nfs_init_delmapcall() 3193 { 3194 nfs_delmapcall_t *delmap_call; 3195 3196 delmap_call = kmem_alloc(sizeof (nfs_delmapcall_t), KM_SLEEP); 3197 delmap_call->call_id = curthread; 3198 delmap_call->error = 0; 3199 3200 return (delmap_call); 3201 } 3202 3203 void 3204 nfs_free_delmapcall(nfs_delmapcall_t *delmap_call) 3205 { 3206 kmem_free(delmap_call, sizeof (nfs_delmapcall_t)); 3207 } 3208 3209 /* 3210 * Searches for the current delmap caller (based on curthread) in the list of 3211 * callers. If it is found, we remove it and free the delmap caller. 3212 * Returns: 3213 * 0 if the caller wasn't found 3214 * 1 if the caller was found, removed and freed. *errp is set to what 3215 * the result of the delmap was. 
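 *
 * Editorial sketch (not in the original source): the expected calling
 * pattern from a delmap routine is approximately
 *
 *	if (nfs_find_and_delete_delmapcall(rp, &error))
 *		return (error);		(re-entry: async work already done)
 *
 *	delmap_call = nfs_init_delmapcall();
 *	mutex_enter(&rp->r_statelock);
 *	insert delmap_call on rp->r_indelmap;
 *	mutex_exit(&rp->r_statelock);
 *	hand delmap_call to the async delmap worker, which stores its
 *	    result in delmap_call->error;
 *
 * The actual callers live in the vnode ops code, outside this excerpt.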
3216  */
3217 int
3218 nfs_find_and_delete_delmapcall(rnode_t *rp, int *errp)
3219 {
3220 	nfs_delmapcall_t	*delmap_call;
3221 
3222 	/*
3223 	 * If the list doesn't exist yet, we create it and return
3224 	 * that the caller wasn't found. No list = no callers.
3225 	 */
3226 	mutex_enter(&rp->r_statelock);
3227 	if (!(rp->r_flags & RDELMAPLIST)) {
3228 		/* The list does not exist */
3229 		list_create(&rp->r_indelmap, sizeof (nfs_delmapcall_t),
3230 		    offsetof(nfs_delmapcall_t, call_node));
3231 		rp->r_flags |= RDELMAPLIST;
3232 		mutex_exit(&rp->r_statelock);
3233 		return (0);
3234 	} else {
3235 		/* The list exists so search it */
3236 		for (delmap_call = list_head(&rp->r_indelmap);
3237 		    delmap_call != NULL;
3238 		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
3239 			if (delmap_call->call_id == curthread) {
3240 				/* current caller is in the list */
3241 				*errp = delmap_call->error;
3242 				list_remove(&rp->r_indelmap, delmap_call);
3243 				mutex_exit(&rp->r_statelock);
3244 				nfs_free_delmapcall(delmap_call);
3245 				return (1);
3246 			}
3247 		}
3248 	}
3249 	mutex_exit(&rp->r_statelock);
3250 	return (0);
3251 }
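/*
 * Editorial sketch (not in the original source): pulling the teardown
 * interfaces in this file together, an unmount path is expected to do
 * roughly the following before freeing the mntinfo:
 *
 *	if (nfs_async_stop_sig(vfsp))		(interruptible variant)
 *		return (EINTR);			(caller gave up)
 *	... stop the async manager thread and delete the kstats
 *	    (not shown in this excerpt) ...
 *	nfs_free_mi(mi);
 *
 * nfs_free_mi() ASSERTs that MI_ASYNC_MGR_STOP is set, that
 * mi_manager_thread is NULL and that mi_threads is zero, so on DEBUG
 * kernels a caller that skips the earlier steps trips immediately.
 */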