MFSLives.c

/*
    File:       MFSLives.c
 
    Contains:   A VFS plug-in example for MFS volumes (original 400 KB floppies).
 
    Written by: DTS
 
    Copyright:  Copyright (c) 2006 by Apple Computer, Inc., All Rights Reserved.
 
    Disclaimer: IMPORTANT:  This Apple software is supplied to you by Apple Computer, Inc.
                ("Apple") in consideration of your agreement to the following terms, and your
                use, installation, modification or redistribution of this Apple software
                constitutes acceptance of these terms.  If you do not agree with these terms,
                please do not use, install, modify or redistribute this Apple software.
 
                In consideration of your agreement to abide by the following terms, and subject
                to these terms, Apple grants you a personal, non-exclusive license, under Apple's
                copyrights in this original Apple software (the "Apple Software"), to use,
                reproduce, modify and redistribute the Apple Software, with or without
                modifications, in source and/or binary forms; provided that if you redistribute
                the Apple Software in its entirety and without modifications, you must retain
                this notice and the following text and disclaimers in all such redistributions of
                the Apple Software.  Neither the name, trademarks, service marks or logos of
                Apple Computer, Inc. may be used to endorse or promote products derived from the
                Apple Software without specific prior written permission from Apple.  Except as
                expressly stated in this notice, no other rights or licenses, express or implied,
                are granted by Apple herein, including but not limited to any patent rights that
                may be infringed by your derivative works or by other works in which the Apple
                Software may be incorporated.
 
                The Apple Software is provided by Apple on an "AS IS" basis.  APPLE MAKES NO
                WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
                WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR
                PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION ALONE OR IN
                COMBINATION WITH YOUR PRODUCTS.
 
                IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
                CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
                GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION
                OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT, TORT
                (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN
                ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
    Change History (most recent first):
 
$Log: MFSLives.c,v $
Revision 1.3  2006/10/31 16:30:03  eskimo1
Updated some comments based on review feedback.
 
Revision 1.2  2006/10/09 13:11:41  eskimo1
Rewrite VNOPBlockmap to document and adopt pre- and post-conditions from kernel engineering.
 
Revision 1.1  2006/07/27 15:47:55  eskimo1
First checked in.
 
 
 
*/
 
/////////////////////////////////////////////////////////////////////
 
// Our helper modules
 
#include "MFSCore.h"
#include "HashNode.h"
#include "MFSLivesMountArgs.h"
 
// System interfaces
 
#include <kern/assert.h>
#include <libkern/libkern.h>
#include <libkern/OSMalloc.h>
#include <libkern/locks.h>
#include <mach/mach_types.h>
#include <sys/dirent.h>
#include <sys/disk.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/kernel_types.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/ubc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/vnode_if.h>
#include <sys/xattr.h>
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** Source Code Notes
 
/*
    Bit Fields
    ----------
    In places where I initialise a bit field, I include both the active bits 
    and the inactive bits (commented out).  This lets you quickly see all of 
    the options that are available and the options that I've specifically enabled.
    
    Terminology
    -----------
    Each volume is made up of a set of file system objects (fsobjs).  These objects 
    are stored on disk (or in some other way, such as across the network).  To speed 
    things up, the system caches information about these file system objects in 
    memory.  The objects in this cache are called vnodes.  The cache is managed by 
    the VFS layer and the VFS plug-in, working in concert.
    
    This cache is /not/ the disk cache (in the traditional sense of the phrase). 
    A disk cache typically caches the contents of blocks on the disk.  Here we're 
    referring to a cache of information about the file system objects on the volume.
    
    Mac OS X does have a disk cache (called the Unified Buffer Cache, UBC), and this 
    example interacts with it when it needs to read directory blocks (using the 
    buf_meta_bread call) and when it reads files (using the cluster_read and 
    cluster_pagein calls).
    
    A vnode is a virtual representation of a file system object.  It's virtual in 
    the sense that it has no information about the concrete implementation of the 
    object on disk (or across the network).  Rather, it's the handle which the 
    higher levels of the system use to learn about and manipulate a given file 
    system object.  The only concrete information about the file system object 
    that is stored in the vnode is a reference to the corresponding FSNode.
    
    An FSNode is the in-memory representation of a file system object.  An FSNode 
    is managed by the VFS plug-in, and contains all of the concrete information 
    needed to manage that file system object.  For example, on HFS Plus the FSNode 
    would store the CNID of the file system object.
 
    We don't use "inode" at all, for two reasons:
    
      o Traditionally, the term "inode" has been used to describe both the 
        on-disk representation of a file system object /and/ the 
        in-memory representation of that object (if it's being cached in memory).
        That's just confusing (-:
      
      o The term "inode" implies a certain style of on-disk organisation, which is 
        not universally applicable (for an obvious example, consider a network 
        file system), and is certainly not applicable to MFS.
    
    Traditionally there is a one-to-one correspondence between vnodes and FSNodes. 
    However, this is not true in the presence of multi-fork files, where there is 
    one vnode for each fork but all of these refer to the same FSNode.
        
    FSNode Hash
    -----------
    It's important to realise that the vnode cache is managed globally by the 
    VFS layer.  The VFS plug-in is expected to follow along with decisions 
    made by the VFS layer.  However, vnodes are created by the VFS plug-ins, 
    as they respond to incoming requests.
    
    The most common situation where a VFS plug-in needs to create a vnode is 
    in VNOPLookup.  In this case, the plug-in has information about the file 
    system object in question (in this example, we have the file number) and 
    needs to create a vnode to return as the result of the lookup.  
    The critical point is that the VFS plug-in MUST NOT create two vnodes 
    for the same file.  Therefore the plug-in must maintain some data structure 
    that:
    
      o can be accessed quickly based on the information in the file system 
        object's directory entry (that is, the file number)
    
      o tells the VFS plug-in which file system objects are currently in memory 
      
      o can return the vnode, if any, associated with that FSNode
    
    This is typically done using a hash table that indexes all of the FSNodes. 
    This is keyed by the file system object's raw device number (dev_t) and 
    inode number (file number in the case of MFS).  Getting the mechanics of 
    this table right is the most difficult part of implementing a VFS plug-in.
    
    In the case of MFSLives, I've moved all of this complexity into a reusable 
    module.  See "HashNode.h" and "HashNode.c" for the details.  There's lots 
    of cool comments in "HashNode.h".
    
    MFS Core
    --------
    I've put all of the code that actually interprets MFS data structures into 
    a separate module.  See "MFSCore.h" and "MFSCore.c" for the details, 
    including an explanation of /why/ I did this.
*/
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** More Asserts
 
// We use the system assert macro (from <kern/assert.h>) for standard asserts.  
// In some cases we also want to assert that an incoming 'flags' parameter 
// has only the bits that we know about set.  In this case we use the 
// AssertKnownFlags macro.  As getting an unknown flag is more of a warning 
// than an error, we just print a message and continue execution.  And, to 
// avoid a flood of junk in the system log, we only print a given message once.
 
#if MACH_ASSERT
 
    static void AssertKnownFlagsCore(
        uint64_t        flags, 
        uint64_t        knownFlags, 
        boolean_t *     havePrintedPtr,
        const char *    fileStr,
        int             lineNumber,
        const char *    flagsStr,
        const char *    knownFlagsStr
    )
        // Core implementation of AssertKnownFlags.
        //
        // If flags has any bit set that is not also set in knownFlags, 
        // logs a one-line warning that identifies the call site (fileStr, 
        // lineNumber), the source text of the two expressions (flagsStr, 
        // knownFlagsStr) and the offending bits.
        //
        // havePrintedPtr, if not NULL, points to a per-call-site latch: 
        // the warning is suppressed when the latch is already set, and 
        // the latch is set whenever unknown flags are seen.  If it is NULL, 
        // the warning is printed every time.
    {
        uint64_t    unknownFlags;
        boolean_t   alreadyWarned;

        unknownFlags = flags & ~knownFlags;
        if (unknownFlags == 0) {
            // Nothing unexpected; leave the latch alone.
            return;
        }

        // Only complain if this call site hasn't complained before.

        alreadyWarned = (havePrintedPtr != NULL) && *havePrintedPtr;
        if ( ! alreadyWarned ) {
            printf("%s:%d: AssertKnownFlags(%s, %s) saw unknown flags 0x%llx.\n",
                fileStr,
                lineNumber,
                flagsStr,
                knownFlagsStr,
                unknownFlags
            );
        }

        // Set the latch so that later hits at this call site stay quiet.

        if (havePrintedPtr != NULL) {
            *havePrintedPtr = TRUE;
        }
    }
 
    // AssertKnownFlags(flags, knownFlags) warns if 'flags' (the incoming flags) 
    // has any bit set outside of 'knownFlags' (the set of all flags that we knew 
    // about when we wrote the code).  Each expansion declares its own static 
    // sHavePrinted latch and passes its address to AssertKnownFlagsCore, so the 
    // once-only suppression is tracked separately for each place the macro is 
    // used.  The # operator passes the source text of both arguments so that 
    // the message can show the expressions as written.
 
    #define AssertKnownFlags(flags, knownFlags) \
        do {                                    \
            static boolean_t sHavePrinted;      \
            AssertKnownFlagsCore((flags), (knownFlags), &sHavePrinted, __FILE__, __LINE__, # flags, # knownFlags); \
        } while (0)
 
#else
 
    // With MACH_ASSERT off, AssertKnownFlags expands to an empty statement; 
    // its arguments are not evaluated.

    #define AssertKnownFlags(flags, knownFlags) do { } while (0)
 
#endif
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** Error Conversion
 
static errno_t ErrnoFromKernReturn(kern_return_t kernErr)
    // Converts a Mach kern_return_t into an errno_t.  The mapping is 
    // deliberately coarse: KERN_SUCCESS becomes 0 and every other 
    // value becomes EINVAL.
{
    return (kernErr == KERN_SUCCESS) ? 0 : EINVAL;
}
 
static kern_return_t KernReturnFromErrno(errno_t err)
    // Maps an errno_t-style error into a kern_return_t-style error.
    //
    // The mapping is coarse: 0 becomes KERN_SUCCESS and every other 
    // value becomes KERN_FAILURE.
{
    kern_return_t kernErr;
    
    if (err == 0) {
        kernErr = KERN_SUCCESS;
    } else {
        kernErr = KERN_FAILURE;
    }
    
    // Return the mapped value, not the incoming errno: a raw errno value 
    // would be misread by the caller as an unrelated Mach error code.
    
    return kernErr;
}
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** Memory and Locks
 
// gOSMallocTag is the tag passed to OSMalloc/OSFree for all of our allocations.  
// It is created by InitMemoryAndLocks and released by TermMemoryAndLocks; 
// it is NULL whenever no tag is held.
 
static OSMallocTag  gOSMallocTag = NULL;
 
// gLockGroup is used for all of our locks.  Like gOSMallocTag, it is created 
// by InitMemoryAndLocks, freed by TermMemoryAndLocks, and NULL otherwise.
 
static lck_grp_t *  gLockGroup = NULL;
 
static void TermMemoryAndLocks(void)
    // Disposes of gOSMallocTag and gLockGroup.
{
    if (gLockGroup != NULL) {
        lck_grp_free(gLockGroup);
        gLockGroup = NULL;
    }
    if (gOSMallocTag != NULL) {
        OSMalloc_Tagfree(gOSMallocTag);
        gOSMallocTag = NULL;
    }
}
 
static kern_return_t InitMemoryAndLocks(void)
    // Initialises gOSMallocTag and gLockGroup.
    //
    // Returns KERN_SUCCESS if both were allocated.  If either allocation 
    // fails, the routine undoes any partial work (via TermMemoryAndLocks) 
    // and returns KERN_FAILURE, so on return the two globals are either 
    // both valid or both NULL.
{ 
    kern_return_t   result;

    // The malloc tag comes first...

    gOSMallocTag = OSMalloc_Tagalloc("com.apple.dts.kext.MFSLives", OSMT_DEFAULT);
    result = (gOSMallocTag != NULL) ? KERN_SUCCESS : KERN_FAILURE;

    // ... and the lock group is only attempted if we got the tag.

    if (result == KERN_SUCCESS) {
        gLockGroup = lck_grp_alloc_init("com.apple.dts.kext.MFSLives", LCK_GRP_ATTR_NULL);
        if (gLockGroup == NULL) {
            result = KERN_FAILURE;
        }
    }

    // On any failure, throw away whatever we managed to allocate.

    if (result != KERN_SUCCESS) {
        TermMemoryAndLocks();
    }

    assert( (result == KERN_SUCCESS) == (gOSMallocTag != NULL) );
    assert( (result == KERN_SUCCESS) == (gLockGroup   != NULL) );

    return result;
}
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** Core Data Structures
 
// gVNodeOperations is set up when we register the VFS plug-in with vfs_fsadd. 
// It holds a pointer to the array of vnode operation functions for this 
// VFS plug-in.  We have to declare it early in this file because it's referenced 
// by the code that creates vnodes (it is passed to vnode_create as the 
// vnfs_vops field of the vnode_fsparam structure).
 
static errno_t (**gVNodeOperations)(void *);
 
#pragma mark - FSMount
 
// FSMount holds the file system specific data that we need per mount point.
// We attach this to the kernel mount_t by calling vfs_setfsprivate in VFSOPMount. 
// There is no reference count on this structure; it lives and dies along with the 
// corresponding mount_t.
 
// Magic numbers for FSMount.fMagic, which the debug-build assertions use to 
// sanity-check FSMount pointers.
enum {
    kFSMountMagic    = 'MFMn',      // fMagic of a valid FSMount
    kFSMountBadMagic = 'M!Mn'       // not referenced in this part of the file; presumably stamped on an 
                                    // FSMount that is being torn down -- TODO confirm
};
 
// Per-mount private data.  See the "FSMount Notes" that follow this 
// structure for the meaning of the [1] annotation.
struct FSMount {
    uint32_t        fMagic;                     // [1] must be kFSMountMagic
    boolean_t       fForceMount;                // [1] copied from MFSLivesMountArgs; see "MFSLivesMountArgs.h" for details
    boolean_t       fForceFailure;              // [1] copied from MFSLivesMountArgs; see "MFSLivesMountArgs.h" for details
    mount_t         fMountPoint;                // [1] back pointer to the mount_t
    dev_t           fBlockRDevNum;              // [1] raw dev_t of the device we're mounted on
    vnode_t         fBlockDevVNode;             // [1] a vnode for the above; we have a use count reference on this
    size_t          fBlockDevBlockSize;         // [1] block size, in bytes, for the above
    uint64_t        fBlockDevBlockCount;        // [1] block count for the above
    
    // The next group of values are all obtained from the MFS core when we 
    // call MFSMDBCheck.  They contain all of the information that we need to 
    // interpret the MFS volume (in concert with the routines exported by 
    // the MFS core).
    
    size_t          fMDBAndVABMSizeInBytes;     // [1] size of combined MDB and VABM, rounded up to the next block size
    uint16_t        fDirectoryStartBlock;       // [1] first block of the directory
    uint16_t        fDirectoryBlockCount;       // [1] number of blocks in the directory
    uint16_t        fAllocationBlocksStartBlock;// [1] block number that holds the first allocation block
    uint32_t        fAllocationBlockSizeInBytes;// [1] allocation block size in bytes
 
    void *          fMDBVABM;                   // [1] a pointer to a buffer that holds the MDB/VABM;
                                                //     its size is fMDBAndVABMSizeInBytes
};
typedef struct FSMount FSMount;
 
// FSMount Notes
// -------------
// [1] This field is immutable.  That is, it's set up as part of the initialisation 
//     process, and is not modified after that.  Thus, it doesn't need to be 
//     protected from concurrent access.  Yay for read-only file systems!
 
static FSMount *   FSMountFromMount(mount_t mp)
    // Returns the FSMount that is stored as the file system private data 
    // of mp.  In the debug version this also checks that mp is not NULL, 
    // that the private data exists and carries kFSMountMagic, and that 
    // its back pointer really does refer to mp.
{
    FSMount *  fsmp;

    assert(mp != NULL);

    fsmp = (FSMount *) vfs_fsprivate(mp);

    assert(fsmp != NULL);
    assert(fsmp->fMagic == kFSMountMagic);
    assert(fsmp->fMountPoint == mp);

    return fsmp;
}
 
#if MACH_ASSERT
 
    static boolean_t ValidFSMount(FSMount *fsmp)
        // Debug-only predicate: true if fsmp is not NULL and its magic 
        // number is kFSMountMagic.  Intended for use inside assert().
    {
        boolean_t   valid;

        valid = (fsmp != NULL);
        if (valid) {
            valid = (fsmp->fMagic == kFSMountMagic);
        }
        return valid;
    }
 
#endif
 
#pragma mark - FSNode
 
// FSNode holds the file system specific data that we need per vnode.  We attach this 
// to the kernel vnode_t when we create a vnode (see the calls to vnode_create below). 
// There is no reference count on this structure; its lifetime is controlled by the 
// HNode that it's associated with.  That's a complex topic that's discussed in detail 
// in the comments in "HashNode.h".
 
// Magic numbers used to sanity-check our in-memory node structures.
enum {
    kFSNodeMagic    = 'MFFn',       // fMagic of a valid, initialised FSNode
    kFSNodeBadMagic = 'M!Fn',       // stamped into fMagic by FSNodeScrub just before the FSNode's memory goes away
    
    kHNodeMagic     = 'MFHn'        // not referenced in this part of the file; presumably handed to the 
                                    // hash layer (see "HashNode.h") -- TODO confirm
};
 
// Per-file-system-object private data.  See the "FSNode Notes" that follow 
// this structure for the meaning of the [1] and [2] annotations.
struct FSNode {
    uint32_t        fMagic;             // [1] must be kFSNodeMagic
    boolean_t       fInitialised;       // [1] true if the FSNode has been initialised
    uint16_t        fDirBlock;          // [1] block number of the file's directory entry; 0 for the root directory FSNode
    size_t          fDirOffset;         // [1] offset of the file's directory entry; 0 for the root directory vnode
    MFSForkInfo     fForkInfo[2];       // [1] data (index 0) and rsrc (index 1) fork info; see the discussion 
                                        //     of MFSForkInfo in "MFSCore.h"; all zeros for the root directory FSNode
    uint32_t        fLastDirOffset;     // [2] cache of last valid uio_offset (for directories)
};
typedef struct FSNode FSNode;
 
// FSNode Notes
// -------------
// [1] This field is immutable.  That is, it's set up when the vnode is created, 
//     and is not modified after that.  Thus, it doesn't need to be protected 
//     from concurrent access.  Yay for read-only file systems!
//
// [2] This is a uint32_t so that we can be sure that all reads and writes 
//     are atomic.  We need this because, if two concurrent threads are reading 
//     through the root directory, they might end up trying to access this field 
//     simultaneously.  As long as those accesses are atomic, we're OK even 
//     without a lock; the value will always be consistent (even though the threads 
//     will 'blow' each other's cache).  If the accesses are not atomic (for example, 
//     if this field was an off_t, which is 64 bits and thus non-atomic when accessed 
//     by 32-bit code), you might get half of value A and half of value B, which would 
//     be bad (What do you mean, "bad?" / Try to imagine...).
//
//     This is a long winded way of saying that we can avoid creating a mutex per 
//     FSNode by keeping this field small (-:
 
static FSNode * FSNodeFromVNode(vnode_t vn)
    // Returns our FSNode for vn: the result of FSNodeGenericFromVNode, 
    // cast to the concrete FSNode type.  Also asserts that the node 
    // exists, carries kFSNodeMagic and has been marked as initialised.
{
    FSNode *    fsn;

    fsn = (FSNode *) FSNodeGenericFromVNode(vn);

    assert( (fsn != NULL) && (fsn->fMagic == kFSNodeMagic) );
    assert(fsn->fInitialised);

    return fsn;
}
 
static FSNode * FSNodeFromHNode(HNodeRef hn)
    // Returns our FSNode for hn: the result of FSNodeGenericFromHNode, 
    // cast to the concrete FSNode type.  Also asserts that the node 
    // exists, carries kFSNodeMagic and has been marked as initialised.
{
    FSNode *    fsn;

    fsn = (FSNode *) FSNodeGenericFromHNode(hn);

    assert( (fsn != NULL) && (fsn->fMagic == kFSNodeMagic) );
    assert(fsn->fInitialised);

    return fsn;
}
 
static FSNode * FSNodeFromHNodeUnchecked(HNodeRef hn)
    // Like FSNodeFromHNode, but only asserts that the FSNode exists; 
    // it does not look at the magic number or the initialised flag.  
    // Use this right after HNodeLookupCreatingIfNecessary, when the 
    // FSNode may be brand new and not yet filled in.
{
    FSNode *    fsn;

    fsn = (FSNode *) FSNodeGenericFromHNode(hn);

    assert(fsn != NULL);

    return fsn;
}
 
#if MACH_ASSERT
 
    static boolean_t ValidFSNode(FSNode *fsn)
        // Debug-only check of an FSNode.  Rather than returning false 
        // for a bad node, it asserts on each condition (not NULL, 
        // correct magic, initialised), so it always returns true if 
        // it returns at all.
    {
        assert(fsn != NULL);
        assert(fsn->fMagic == kFSNodeMagic);
        assert(fsn->fInitialised);

        return TRUE;
    }
 
    static boolean_t ValidVNode(vnode_t vn)
        // Debug-only check that vn is a plausible vnode on our file system.  
        // Most problems are caught by assertions rather than reported via 
        // the result:
        //
        // o vn's mount must have a valid FSMount and vn a valid FSNode 
        //   (checked by FSMountFromMount and FSNodeFromVNode)
        //
        // o a directory must have a zero directory block and offset
        //
        // o a regular file's directory entry must lie within the 
        //   volume's directory blocks, at an offset within one block
        //
        // o anything else is unexpected
    {
        FSMount *   mountInfo;
        FSNode *    nodeInfo;

        assert(vn != NULL);

        mountInfo = FSMountFromMount( vnode_mount(vn) );
        nodeInfo  = FSNodeFromVNode(vn);

        if ( vnode_isdir(vn) ) {
            assert(nodeInfo->fDirBlock  == 0);
            assert(nodeInfo->fDirOffset == 0);
        } else if ( vnode_isreg(vn) ) {
            assert(nodeInfo->fDirBlock >= mountInfo->fDirectoryStartBlock);
            assert(nodeInfo->fDirBlock < (mountInfo->fDirectoryStartBlock + mountInfo->fDirectoryBlockCount));
            assert(nodeInfo->fDirOffset < mountInfo->fBlockDevBlockSize);
        } else {
            assert(FALSE);
        }

        return (mountInfo != NULL);
    }
 
#endif
 
static void FSNodeScrub(FSNode * fsn)
    // Called to clean out an FSNode just before its memory is deallocated.  
    // No locking is needed: by the time we get here, this thread is the 
    // only one that can still reach the FSNode.
    //
    // An MFSLives FSNode owns nothing that needs releasing, so all we 
    // do is overwrite the magic number with kFSNodeBadMagic; any later 
    // use of the stale FSNode will then trip the magic number assertions.
{
    fsn->fMagic = kFSNodeBadMagic;
}
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** Core Algorithms
 
// It may seem like there's a lot of redundancy in the following routines 
// (FSNodeGetOrCreateRootVNode, FSNodeGetOrCreateFileVNodeByName, 
// FSNodeGetOrCreateFileVNodeByID, and their associated subroutines), but 
// it wasn't obvious how to refactor them to reduce the redundancy without 
// complicating the code excessively.  As one goal of this sample is to keep 
// things simple, I decided to prefer a lot of simple code over a small 
// amount of complex code.
 
static errno_t FSNodeGetOrCreateRootVNode(FSMount *fsmp, vnode_t *vnPtr)
    // Gets the root vnode for the file system, or creates one if none 
    // exists.  This is the core of VFSOPRoot.
    //
    // fsmp must point to a valid FSMount.
    //
    // vnPtr must not be NULL, and *vnPtr must be NULL on entry.  On error, 
    // *vnPtr will be NULL.  On success, *vnPtr will be a vnode with an I/O 
    // reference that the caller is responsible for releasing.
    //
    // Returns 0 on success, or the error from HNodeLookupCreatingIfNecessary 
    // or vnode_create.
    //
    // The overall structure of this routine is dictated by the architecture 
    // of the hash layer; see the comments in "HashNode.h" for details.
{
    int             err;
    vnode_t         vn;
    HNodeRef        hn;
    FSNode *        fsn;
    
    assert(ValidFSMount(fsmp));
    assert( vnPtr != NULL);
    assert(*vnPtr == NULL);
    
    hn = NULL;
    vn = NULL;
 
    // Ask the hash layer for the node keyed by (device, root inode number).  
    // The third argument is presumably the fork index (0, as a directory 
    // has no forks); see "HashNode.h" to confirm.
 
    err = HNodeLookupCreatingIfNecessary(fsmp->fBlockRDevNum, kMFSRootInodeNumber, 0, &hn, &vn);
    
    // If the lookup succeeded and handed back a vnode, we simply return 
    // it.  If it succeeded without a vnode, it's up to us to create one 
    // and then tell the hash layer how that went.
    
    if ( (err == 0) && (vn == NULL) ) {
        struct vnode_fsparam    params;
 
        fsn = FSNodeFromHNodeUnchecked(hn);
        
        // If this is a new FSNode, initialise it.
        
        if ( ! fsn->fInitialised ) {
            fsn->fMagic       = kFSNodeMagic;
            fsn->fInitialised = TRUE;
            // For the root directory, all other fields can stay zero.
        }
 
        // Try to create the vnode.  Note that the vnode's fsnode is the 
        // HNodeRef, not the FSNode itself.
        
        params.vnfs_mp         = fsmp->fMountPoint;
        params.vnfs_vtype      = VDIR;
        params.vnfs_str        = NULL;
        params.vnfs_dvp        = NULL;
        params.vnfs_fsnode     = hn;
        params.vnfs_vops       = gVNodeOperations;
        params.vnfs_markroot   = TRUE;
        params.vnfs_marksystem = FALSE;
        params.vnfs_rdev       = 0;                 // we don't currently support VBLK or VCHR
        params.vnfs_filesize   = 0;                 // not relevant for a directory
 
        // Name caching is completely disabled until I can work through all of the issues.  
        // Specifically, HFS Plus won't cache a precomposed name, and I think I should 
        // do the same.
 
        params.vnfs_cnp        = NULL;
        params.vnfs_flags      = VNFS_NOCACHE | VNFS_CANTCACHE;
        
        err = vnode_create(VNCREATE_FLAVOR, sizeof(params), &params, &vn);
        
        assert( (err == 0) == (vn != NULL) );
        
        // Complete our contract with the hash layer.  On success we hand 
        // it the new vnode.  On failure we tell it so; if it returns true 
        // we scrub the FSNode and then report that the scrub is done.
        
        if (err == 0) {
            HNodeAttachVNodeSucceeded(hn, 0, vn);
        } else {
            if ( HNodeAttachVNodeFailed(hn, 0) ) {
                FSNodeScrub(fsn);
                HNodeScrubDone(hn);
            }
        }
    }
 
    // Only publish the vnode on success, so that *vnPtr stays NULL on error.
 
    if (err == 0) {
        *vnPtr = vn;
    }
 
    assert( (err == 0) == (*vnPtr != NULL) );
    
    return err;
}
 
static errno_t CheckForForkSpecifier(struct componentname *cn, size_t *forkIndexPtr)
    // Looks at the text /after/ the current path component in cn to see 
    // whether it is a fork specifier ("/..namedfork/data" or 
    // "/..namedfork/rsrc").  This is used by FSNodeGetOrCreateFileVNodeByName 
    // to see if the user is trying to open a specific fork.
    //
    // cn must not be NULL.  forkIndexPtr must not be NULL and *forkIndexPtr 
    // must be 0 on entry.
    //
    // The possible outcomes are:
    //
    // o If the current component is the last one (ISLASTCN is set), nothing 
    //   is changed and the result is 0.
    //
    // o If it is followed by the data fork specifier, the specifier is 
    //   consumed (via cn_consume), *forkIndexPtr stays 0 and the result is 0.
    //
    // o If it is followed by the rsrc fork specifier, the specifier is 
    //   consumed and *forkIndexPtr is set to 1.  The result is EPERM if the 
    //   lookup is for DELETE or RENAME, and 0 otherwise.
    //
    // o If it is followed by anything else, nothing is changed and the 
    //   result is 0.
{
    static const char   kDataForkSpecifier[] = "/..namedfork/data";
    static const char   kRsrcForkSpecifier[] = "/..namedfork/rsrc";
    int                 err;
    const char *        trailer;

    assert(cn != NULL);
    assert( forkIndexPtr != NULL);
    assert(*forkIndexPtr == 0);

    err = 0;

    // A further component after a file name is odd on a flat file system, 
    // so the only thing we recognise there is a fork specifier.

    if ( (cn->cn_flags & ISLASTCN) == 0 ) {
        trailer = cn->cn_nameptr + cn->cn_namelen;
        assert(*trailer == '/');

        // Strictly speaking we can't be sure that the memory at trailer is 
        // valid, but this is more-or-less how HFS does it.  Note that 
        // sizeof(...) - 1 is the length of the specifier without its 
        // terminating null.

        if (strncmp(trailer, kDataForkSpecifier, sizeof(kDataForkSpecifier) - 1) == 0) {
            assert(*forkIndexPtr == 0);
            cn->cn_consume = sizeof(kDataForkSpecifier) - 1;
        } else if (strncmp(trailer, kRsrcForkSpecifier, sizeof(kRsrcForkSpecifier) - 1) == 0) {
            *forkIndexPtr = 1;
            cn->cn_consume = sizeof(kRsrcForkSpecifier) - 1;
        }
    }

    // Looking up a resource fork in order to delete or rename it is just 
    // wrong, so refuse.  This probably isn't strictly necessary on a 
    // read-only file system, but HFS makes the same check and it does 
    // no harm.

    if (*forkIndexPtr != 0) {
        switch (cn->cn_nameiop) {
            case DELETE:
            case RENAME:
                err = EPERM;
                break;
            default:
                break;
        }
    }

    return err;
}
 
static errno_t SearchDirectoryByName(
    FSMount *               fsmp, 
    struct componentname *  cn, 
    uint16_t *              dirBlockPtr, 
    size_t *                dirOffsetPtr, 
    MFSForkInfo             forkInfo[], 
    struct vnode_attr *     attr
)
    // Searches the MFS directory on a volume (represented by fsmp) for a directory entry 
    // based on its name (referenced by cn), and returns various attributes of that 
    // directory entry (*dirBlockPtr, *dirOffsetPtr, forkInfo[0] (data fork info), 
    // forkInfo[1] (rsrc fork info) and, optionally, attr).
    //
    // fsmp must be a valid FSMount.  cn, dirBlockPtr, dirOffsetPtr and forkInfo 
    // must not be NULL; forkInfo must have room for two elements.  attr may be NULL.
    //
    // Returns 0 if the entry was found, in which case *dirBlockPtr and *dirOffsetPtr 
    // identify the directory block and the offset within it.  Returns ENOMEM if the 
    // temporary buffer can't be allocated.  Otherwise returns the error from 
    // buf_meta_bread or from the MFS core (ENOENT once every directory block has 
    // been searched without a match).  On error, *dirBlockPtr and *dirOffsetPtr 
    // are not modified.
{
    int         err;
    void *      tempBuffer;
    uint16_t    dirBlock;
    size_t      dirOffset;
    
    assert(ValidFSMount(fsmp));
    assert(cn != NULL);
    assert(dirBlockPtr != NULL);
    assert(dirOffsetPtr != NULL);
    assert(forkInfo != NULL);
    // attr can be NULL
 
    // Create the temporary buffer used by MFSDirectoryBlockFindEntryByName.
 
    err = 0;
    tempBuffer = OSMalloc(kMFSDirectoryBlockFindEntryByNameTempBufferSize, gOSMallocTag);
    if (tempBuffer == NULL) {
        err = ENOMEM;
    }
 
    // Iterate through the directory blocks, reading them into memory, and then calling 
    // the MFS core to look for the directory item.
    
    if (err == 0) {
        // MFSDirectoryBlockFindEntryByName requires that we clear the first byte of 
        // tempBuffer to tell it that it hasn't seen this buffer before.
        
        *((char *) tempBuffer) = 0;
 
        dirBlock = fsmp->fDirectoryStartBlock;
        do {
            buf_t           buf;
            const void *    bufData;
            
            buf = NULL;
            
            err = buf_meta_bread(fsmp->fBlockDevVNode, dirBlock, fsmp->fBlockDevBlockSize, NULL, &buf);
 
            if (err == 0) {
                bufData = (const void *) buf_dataptr(buf);
                assert(bufData != NULL);
 
                err = MFSDirectoryBlockFindEntryByName(
                    bufData,
                    fsmp->fBlockDevBlockSize,
                    cn->cn_nameptr,
                    cn->cn_namelen,
                    tempBuffer,
                    &dirOffset,
                    attr
                );
                
                // If we found the item, return its fork info as well.
                
                if (err == 0) {
                    err = MFSDirectoryEntryGetForkInfo(bufData, dirOffset, 0, &forkInfo[0]);
                }
                if (err == 0) {
                    err = MFSDirectoryEntryGetForkInfo(bufData, dirOffset, 1, &forkInfo[1]);
                }
 
                // If we didn't find the item, try the next directory block.  We use 
                // EAGAIN as a private "go around again" signal; once we run out 
                // of directory blocks, the ENOENT stands and ends the loop.
                
                if (err == ENOENT) {
                    dirBlock += 1;
                    
                    if (dirBlock < (fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount)) {
                        err = EAGAIN;
                    }
                }
            }
            
            // Release the buffer regardless of whether the search succeeded 
            // (buf is still NULL if buf_meta_bread didn't give us one).
            
            if (buf != NULL) {
                buf_brelse(buf);
            }
        } while (err == EAGAIN);
    }
    
    // Copy the results out to the caller.
    
    if (err == 0) {
        *dirBlockPtr  = dirBlock;
        *dirOffsetPtr = dirOffset;
    }
    
    // Clean up.  OSFree must be given exactly the size that was passed to 
    // OSMalloc, so we use the same constant here as in the allocation above.
    
    if (tempBuffer != NULL) {
        OSFree(tempBuffer, kMFSDirectoryBlockFindEntryByNameTempBufferSize, gOSMallocTag);
    }
 
    // Post-conditions
    
    assert( (err != 0) || ((*dirBlockPtr >= fsmp->fDirectoryStartBlock) && (*dirBlockPtr < (fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount))) );
    assert( (err != 0) || (*dirOffsetPtr < fsmp->fBlockDevBlockSize) );
    
    return err;
}
 
static errno_t SearchDirectoryByID(
    FSMount *           fsmp, 
    ino_t               ino, 
    uint16_t *          dirBlockPtr, 
    size_t *            dirOffsetPtr, 
    MFSForkInfo         forkInfo[], 
    struct vnode_attr * attr
)
    // Looks for the directory entry whose file number is ino on the volume 
    // described by fsmp.
    //
    // On success (0), *dirBlockPtr and *dirOffsetPtr identify the directory block 
    // and the offset within that block of the entry, forkInfo[0] and forkInfo[1] 
    // hold the data and resource fork info respectively and, if attr is not NULL, 
    // attr has been filled in by MFSDirectoryEntryGetAttr.
    //
    // On failure none of the output parameters are written by this routine 
    // (forkInfo and attr may have been partially written by the MFS core).  
    // ENOENT is returned if every directory block was scanned without a match; 
    // any other error comes from buf_meta_bread or the MFS core.
{
    int         err;
    uint16_t    blockNum;
    size_t      entryOffset;
    
    assert(ValidFSMount(fsmp));
    // ino can be anything
    assert(dirBlockPtr != NULL);
    assert(dirOffsetPtr != NULL);
    assert(forkInfo != NULL);
    // attr can be NULL
 
    // Walk the directory blocks one at a time.  Each block is read via the buffer 
    // cache and then scanned entry by entry using the MFS core iterator.
 
    blockNum = fsmp->fDirectoryStartBlock;
    for (;;) {
        buf_t           blockBuf;
        const void *    blockData;
        
        blockBuf = NULL;
        
        err = buf_meta_bread(fsmp->fBlockDevVNode, blockNum, fsmp->fBlockDevBlockSize, NULL, &blockBuf);
 
        if (err == 0) {
            blockData = (const void *) buf_dataptr(blockBuf);
            assert(blockData != NULL);
 
            // Scan this block.  We leave the loop either with err == 0 (entryOffset 
            // refers to the matching entry) or with the error that stopped the 
            // iteration.
 
            entryOffset = kMFSDirectoryBlockIterateFromStart;
            for (;;) {
                struct vnode_attr idAttr;
                VATTR_INIT(&idAttr);
                VATTR_WANTED(&idAttr, va_fileid);
 
                err = MFSDirectoryBlockIterate(
                    blockData,
                    fsmp->fBlockDevBlockSize,
                    &entryOffset,
                    &idAttr
                );
                if ( (err != 0) || (idAttr.va_fileid == ino) ) {
                    break;
                }
            }
            
            // A match: collect the optional attributes and both forks' info.
            
            if (err == 0) {
                if (attr != NULL) {
                    err = MFSDirectoryEntryGetAttr(blockData, entryOffset, attr);
                }
                if (err == 0) {
                    err = MFSDirectoryEntryGetForkInfo(blockData, entryOffset, 0, &forkInfo[0]);
                }
                if (err == 0) {
                    err = MFSDirectoryEntryGetForkInfo(blockData, entryOffset, 1, &forkInfo[1]);
                }
            }
            
            // No match in this block: move on to the next one, if there is one.  
            // EAGAIN is purely an internal "go around again" marker.
 
            if (err == ENOENT) {
                blockNum += 1;
                
                if (blockNum < (fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount)) {
                    err = EAGAIN;
                }
            }
        }
        
        // buf_meta_bread may hand back a buffer even when it fails, so release 
        // based on the buffer pointer, not on err.
        
        if (blockBuf != NULL) {
            buf_brelse(blockBuf);
        }
 
        if (err != EAGAIN) {
            break;
        }
    }
 
    // Hand the location back to the caller, but only on success.
 
    if (err == 0) {
        *dirBlockPtr  = blockNum;
        *dirOffsetPtr = entryOffset;
    }
 
    // Post-conditions
    
    assert( (err != 0) || ((*dirBlockPtr >= fsmp->fDirectoryStartBlock) && (*dirBlockPtr < (fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount))) );
    assert( (err != 0) || (*dirOffsetPtr < fsmp->fBlockDevBlockSize) );
    
    return err;
}
 
static errno_t FSNodeGetOrCreateFileVNodeByName(FSMount *fsmp, struct componentname *cn, vnode_t dirVN, vnode_t *vnPtr)
    // Gets the file vnode for the file whose name is referenced by cn, or 
    // creates one if none exists.  This forms the core of VNOPLookup.
    //
    // fsmp must point to a valid FSMount.
    //
    // cn must point to a componentname structure specifying the file name to 
    // look up.
    //
    // dirVN must be the directory containing the file; for MFS, this is necessarily 
    // the root directory vnode (because that's the only directory!).
    //
    // vnPtr must not be NULL, and *vnPtr must be NULL on entry.  On error, *vnPtr 
    // will be NULL.  On success, *vnPtr will be a vnode with an I/O reference that 
    // the caller is responsible for releasing.
    //
    // Returns 0 on success, or the error from the directory search, the fork 
    // specifier check, the hash layer lookup, or vnode_create.
    //
    // The overall structure of this routine is dictated by the architecture 
    // of the hash layer; see the comments in "HashNode.h" for details.
{
    int                 err;
    vnode_t             vn;
    HNodeRef            hn;
    FSNode *            fsn;
    uint16_t            dirBlock;
    size_t              dirOffset;
    struct vnode_attr   attr;
    MFSForkInfo         forkInfo[2];
    size_t              forkIndex;
    
    dirBlock = 0;       // quieten warning
    dirOffset = 0;      // quieten warning
    
    assert(ValidFSMount(fsmp));
    assert(cn != NULL);
    assert(dirVN != NULL);
    assert( vnPtr != NULL);
    assert(*vnPtr == NULL);
    
    hn = NULL;
    vn = NULL;
    
    forkIndex = 0;
    
    // Because we don't know the inode number, we have to search the disk /before/ doing the 
    // hash layer lookup.
    
    VATTR_INIT(&attr);
    VATTR_WANTED(&attr, va_fileid);
 
    err = SearchDirectoryByName(fsmp, cn, &dirBlock, &dirOffset, forkInfo, &attr);
 
    // And then see if the user supplied a fork specifier.
    
    if (err == 0) {
        err = CheckForForkSpecifier(cn, &forkIndex);
    }
 
    // Now we can look it up in the hash table.
    
    if (err == 0) {
        // The file ID is narrowed to ino_t below; make sure nothing is lost.
        
        assert( (attr.va_fileid & 0xFFFFFFFF00000000LL) == 0 );
        
        err = HNodeLookupCreatingIfNecessary(fsmp->fBlockRDevNum, (ino_t) attr.va_fileid, forkIndex, &hn, &vn);
    }
    
    // If the lookup succeeded but returned no vnode, it's our job to create 
    // one and attach it to the hash node.
    
    if ( (err == 0) && (vn == NULL) ) {
        struct vnode_fsparam    params;
 
        fsn = FSNodeFromHNodeUnchecked(hn);
        
        // If this is a new FSNode, initialise it from the directory search results.  
        // An already initialised FSNode (for example, one created for the other fork) 
        // is left untouched.
        
        if ( ! fsn->fInitialised ) {
            fsn->fMagic       = kFSNodeMagic;
            fsn->fInitialised = TRUE;
            fsn->fDirBlock    = dirBlock;
            fsn->fDirOffset   = dirOffset;
            fsn->fForkInfo[0] = forkInfo[0];
            fsn->fForkInfo[1] = forkInfo[1];
        }
 
        // Try to create the vnode.
 
        params.vnfs_mp         = fsmp->fMountPoint;
        params.vnfs_vtype      = VREG;
        params.vnfs_str        = NULL;
        params.vnfs_dvp        = dirVN;
        params.vnfs_fsnode     = hn;
        params.vnfs_vops       = gVNodeOperations;
        params.vnfs_markroot   = FALSE;
        params.vnfs_marksystem = FALSE;
        params.vnfs_rdev       = 0;                 // we don't currently support VBLK or VCHR
        params.vnfs_filesize   = fsn->fForkInfo[forkIndex].lengthInBytes;
 
        // Name caching is completely disabled until I can work through all of the issues.  
        // Specifically, HFS Plus won't cache a precomposed name, and I think I should 
        // do the same.
        
        params.vnfs_cnp        = NULL;
        params.vnfs_flags      = VNFS_NOCACHE | VNFS_CANTCACHE;
        
        err = vnode_create(VNCREATE_FLAVOR, sizeof(params), &params, &vn);
        
        assert( (err == 0) == (vn != NULL) );
        
        // Complete our contract with the hash layer.  On failure we must only 
        // scrub the FSNode when HNodeAttachVNodeFailed says so; scrubbing 
        // unconditionally would destroy state that may still be in use via 
        // another vnode (the other fork) attached to the same hash node.  This 
        // matches what FSNodeGetOrCreateFileVNodeByID does.
 
        if (err == 0) {
            HNodeAttachVNodeSucceeded(hn, forkIndex, vn);
        } else {
            if ( HNodeAttachVNodeFailed(hn, forkIndex) ) {
                FSNodeScrub(fsn);
                HNodeScrubDone(hn);
            }
        }
    }
 
    if (err == 0) {
        *vnPtr = vn;
    }
 
    assert( (err == 0) == (*vnPtr != NULL) );
    
    return err;
}
 
static errno_t FSNodeGetOrCreateFileVNodeByID(FSMount *fsmp, ino_t ino, size_t forkIndex, vnode_t *vnPtr)
    // Gets the file vnode for a given fork within a given file number, or 
    // creates one if none exists.  The ability to find a vnode by its ID 
    // is critical to supporting volfs (see VFSOPVget), and we also use it 
    // to find the resource fork for a given data fork when reading extended 
    // attributes.
    //
    // fsmp must point to a valid FSMount.
    //
    // ino is the file number of the file whose vnode we're looking for.
    //
    // forkIndex is the fork whose vnode we're looking for (0 or 1).
    //
    // vnPtr must not be NULL, and *vnPtr must be NULL on entry.  On error, *vnPtr 
    // will be NULL.  On success, *vnPtr will be a vnode with an I/O reference that 
    // the caller is responsible for releasing.
    //
    // Returns 0 on success, or the error from the hash layer lookup, the 
    // directory search, the root vnode lookup, or vnode_create.
    //
    // The overall structure of this routine is dictated by the architecture 
    // of the hash layer; see the comments in "HashNode.h" for details.
{
    int                 err;
    int                 junk;
    vnode_t             vn;
    HNodeRef            hn;
    FSNode *            fsn;
    vnode_t             dirVN;
    
    assert(ValidFSMount(fsmp));
    assert(ino >= kMFSFirstFileInodeName);      // all file inode numbers are greater than 16; the only other inode number is 2 for the root
    assert(forkIndex <= 1);
    assert( vnPtr != NULL);
    assert(*vnPtr == NULL);
    
    hn = NULL;
    vn = NULL;
    dirVN = NULL;
 
    // Because we know the inode number, we can do the hash layer lookup before hitting the 
    // disk to search the directory.  This will speed things up in the case where we already 
    // have a hash node for the item.
    
    err = HNodeLookupCreatingIfNecessary(fsmp->fBlockRDevNum, ino, forkIndex, &hn, &vn);
    
    if ( (err == 0) && (vn == NULL) ) {
        struct vnode_fsparam    params;
 
        fsn = FSNodeFromHNodeUnchecked(hn);
        
        // If this is a new FSNode, initialise it by finding the file's directory 
        // entry on disk.
        
        if ( ! fsn->fInitialised ) {
            fsn->fMagic       = kFSNodeMagic;
            fsn->fInitialised = TRUE;
 
            err = SearchDirectoryByID(fsmp, ino, &fsn->fDirBlock, &fsn->fDirOffset, fsn->fForkInfo, NULL);
        }
 
        // The parent of all file vnodes is the root.  That sounds pretty obvious, but it's 
        // actually a bit tricky.  Specifically, the vnodes that represent the resource fork 
        // of a file also have their parent set to the root, not, for example to NULL, or to the 
        // file itself.  This is inline with the HFS Plus implementation.
        //
        // This must happen regardless of whether the FSNode was already initialised; 
        // otherwise a vnode created for the second fork of a file (whose FSNode was 
        // set up when the first fork's vnode was created) would get a NULL parent.
        
        if (err == 0) {
            err = FSNodeGetOrCreateRootVNode(fsmp, &dirVN);
        }
 
        // Try to create the vnode.
 
        if (err == 0) {
            params.vnfs_mp         = fsmp->fMountPoint;
            params.vnfs_vtype      = VREG;
            params.vnfs_str        = NULL;
            params.vnfs_dvp        = dirVN;
            params.vnfs_fsnode     = hn;
            params.vnfs_vops       = gVNodeOperations;
            params.vnfs_markroot   = FALSE;
            params.vnfs_marksystem = FALSE;
            params.vnfs_rdev       = 0;                 // we don't currently support VBLK or VCHR
            params.vnfs_filesize   = fsn->fForkInfo[forkIndex].lengthInBytes;
 
            // Name caching is completely disabled until I can work through all of the issues.  
            // Specifically, HFS Plus won't cache a precomposed name, and I think I should 
            // do the same.
            
            params.vnfs_cnp        = NULL;
            params.vnfs_flags      = VNFS_NOCACHE | VNFS_CANTCACHE;
            
            err = vnode_create(VNCREATE_FLAVOR, sizeof(params), &params, &vn);
        }
        
        assert( (err == 0) == (vn != NULL) );
        
        // Complete our contract with the hash layer.  On failure, only scrub the 
        // FSNode if the hash layer tells us to.
 
        if (err == 0) {
            HNodeAttachVNodeSucceeded(hn, forkIndex, vn);
        } else {
            if ( HNodeAttachVNodeFailed(hn, forkIndex) ) {
                FSNodeScrub(fsn);
                HNodeScrubDone(hn);
            }
        }
    }
 
    if (err == 0) {
        *vnPtr = vn;
    }
    
    // Drop the I/O reference we took on the root vnode; it was only needed 
    // for the duration of vnode_create.
    
    if (dirVN != NULL) {
        junk = vnode_put(dirVN);
        assert(junk == 0);
    }
 
    assert( (err == 0) == (*vnPtr != NULL) );
    
    return err;
}
 
static errno_t FSNodeGetOrCreateVNodeByID(FSMount *fsmp, ino_t ino, size_t forkIndex, vnode_t *vnPtr)
    // Gets a vnode for the file system object with the given inode number, or 
    // creates one if none exists.  This is just a dispatcher: the root inode 
    // number goes to FSNodeGetOrCreateRootVNode, inode numbers in the file range 
    // go to FSNodeGetOrCreateFileVNodeByID, and anything else is ENOENT.
    //
    // fsmp must point to a valid FSMount.
    //
    // ino is the inode number of the file system object whose vnode we're looking for.
    //
    // forkIndex is the fork whose vnode we're looking for.  It must be 0 if ino 
    // is that of the root.
    //
    // vnPtr must not be NULL, and *vnPtr must be NULL on entry.  On error, *vnPtr 
    // will be NULL.  On success, *vnPtr will be a vnode with an I/O reference that 
    // the caller is responsible for releasing.
{
    int     err;
    
    assert(ValidFSMount(fsmp));
    assert(forkIndex <= 1);
    assert( vnPtr != NULL);
    assert(*vnPtr == NULL);
 
    if (ino != kMFSRootInodeNumber) {
        if (ino >= kMFSFirstFileInodeName) {
            err = FSNodeGetOrCreateFileVNodeByID(fsmp, ino, forkIndex, vnPtr);
        } else {
            // Neither the root nor a valid file number.
            
            err = ENOENT;
        }
    } else {
        // The root directory has no forks, so only fork 0 makes sense.
        
        assert(forkIndex == 0);
        
        err = FSNodeGetOrCreateRootVNode(fsmp, vnPtr);
    }
    
    assert( (err == 0) == (*vnPtr != NULL) );
    
    return err;
}
 
static errno_t FSNodeGetFinderInfo(FSMount *fsmp, FSNode *fsn, uint8_t *finderInfo)
    // Reads the directory block that holds the file's directory entry 
    // (fsn->fDirBlock) and asks the MFS core to copy the entry's Finder info 
    // into finderInfo (which must be 16 bytes).
    //
    // Returns 0 on success, or the error from buf_meta_bread or 
    // MFSDirectoryEntryGetFinderInfo.
{
    int             err;
    buf_t           dirBuf;
    
    assert(ValidFSMount(fsmp));
    assert(ValidFSNode(fsn));
    assert(finderInfo != NULL);
    
    dirBuf = NULL;
    
    err = buf_meta_bread(fsmp->fBlockDevVNode, fsn->fDirBlock, fsmp->fBlockDevBlockSize, NULL, &dirBuf);
 
    if (err == 0) {
        const void *    dirData = (const void *) buf_dataptr(dirBuf);
        
        assert(dirData != NULL);
        
        err = MFSDirectoryEntryGetFinderInfo(dirData, fsn->fDirOffset, finderInfo);
    }
 
    // Release whatever buffer we were given, even if the read failed.
 
    if (dirBuf != NULL) {
        buf_brelse(dirBuf);
    }
    
    return err;
}
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** VNode Operations
 
static errno_t VNOPLookup(struct vnop_lookup_args *ap)
    // This is called by VFS to do a directory lookup.
    //
    // a_dvp is the directory to search.
    //
    // a_cnp describes the name to search for.  This is kinda complicated, although 
    // the comments in <sys/vnode.h> are pretty helpful.
    //
    // a_vpp is a pointer to a vnode where we return the found item.  The 
    // returned vnode must have an I/O reference, and the caller is responsible 
    // for releasing it.  On error we store NULL there.
    //
    // a_context identifies the calling process.
{
    errno_t                 err;
    vnode_t                 dirVN;
    vnode_t *               resultPtr;
    struct componentname *  name;
    vfs_context_t           context;
    FSMount *               fsmp;
    vnode_t                 resultVN;
    
    // Unpack arguments
    
    dirVN     = ap->a_dvp;
    resultPtr = ap->a_vpp;
    name      = ap->a_cnp;
    context   = ap->a_context;
    
    // Pre-conditions
    
    assert(dirVN != NULL);
    assert(vnode_isdir(dirVN));         // VFS already checks that the directory is a directory
    assert(vnode_isvroot(dirVN));       // and the only directory we have is the root
    assert( ValidVNode(dirVN) );
    assert(resultPtr != NULL);
    assert(name != NULL);
    assert(context != NULL);
    
    // Prepare for failure.
    
    resultVN = NULL;
 
    // Implementation
 
    fsmp = FSMountFromMount(vnode_mount(dirVN));
    
    if ( (name->cn_flags & ISDOTDOT) || ( (name->cn_namelen == 1) && (name->cn_nameptr[0] == '.') ) ) {
        // ".." (the parent directory) and "." (this directory) both resolve to the 
        // directory we were given: we only support one directory (the root), and the 
        // parent of the root is always the root.  That wouldn't be true in a more 
        // general VFS plug-in.  All we need is an I/O reference on the directory.
 
        err = vnode_get(dirVN);
        if (err == 0) {
            resultVN = dirVN;
        }
    } else {
        // For real directory items, do the real work in FSNodeGetOrCreateFileVNodeByName.
        
        err = FSNodeGetOrCreateFileVNodeByName(fsmp, name, dirVN, &resultVN);
    }
    
    // Store the result unconditionally.  That way we satisfy the post-condition 
    // no matter what VFS used as the initial value of the output vnode pointer.
    
    *resultPtr = resultVN;
    
    // Post-conditions
    
    assert( (err == 0) == (*resultPtr != NULL) );
    
    return err;
}
 
static errno_t VNOPGetattr(struct vnop_getattr_args *ap)
    // Called by VFS to get information about a vnode (this is called by the 
    // VFS implementation of <x-man-page://2/stat> and <x-man-page://2/getattrlist>).
    //
    // vp is the vnode whose information is requested.
    //
    // vap describes the attributes requested and the place to store the results.
    //
    // context identifies the calling process.
    //
    // You have two options for doing this:
    //
    // o For attributes whose values you have readily available, use the VATTR_RETURN 
    //   macro to unilaterally return the value.
    //
    // o For attributes whose values are hard to calculate, use VATTR_IS_ACTIVE to see 
    //   if the caller requested the attribute and, if so, copy the value into the 
    //   appropriate field.
    //
    // Our implementation has two cases:
    //
    // o For the root vnode, we return a bunch of static values.
    //
    // o For file vnodes, we pass off the work to the MFS core.
{
    int                 err;
    vnode_t             vp;
    struct vnode_attr * vap;
    vfs_context_t       context;
    FSMount *           fsmp;
    FSNode *            fsn;
 
    // Unpack arguments
 
    vp      = ap->a_vp;
    vap     = ap->a_vap;
    context = ap->a_context;
 
    // Pre-conditions
    
    assert( ValidVNode(vp) );
    assert(vap != NULL);
    assert(context != NULL);
 
    // Implementation
    
    fsmp = FSMountFromMount(vnode_mount(vp));
    fsn = FSNodeFromVNode(vp);
 
    if (vnode_isdir(vp)) {
        struct vfs_attr     volAttr;
 
        // For the root vnode, return a bunch of static data, plus some stuff that we 
        // crib from the volume itself.
 
        assert( vnode_isvroot(vp) );
        
        VFSATTR_INIT(&volAttr);
        VFSATTR_WANTED(&volAttr, f_filecount);
        VFSATTR_WANTED(&volAttr, f_create_time);
        VFSATTR_WANTED(&volAttr, f_modify_time);
        VFSATTR_WANTED(&volAttr, f_access_time);
        VFSATTR_WANTED(&volAttr, f_backup_time);
        
        err = MFSMDBGetAttr(fsmp->fMDBVABM, &volAttr);
        
        if (err == 0) {
            VATTR_RETURN(vap, va_rdev,        0);
            assert(VFSATTR_IS_SUPPORTED(&volAttr, f_filecount));
            VATTR_RETURN(vap, va_nlink,       2 + volAttr.f_filecount);     // traditional for directories
//          VATTR_RETURN(vap, va_total_size,  xxx);
//          VATTR_RETURN(vap, va_total_alloc, xxx);
            VATTR_RETURN(vap, va_data_size,   volAttr.f_filecount * sizeof(struct dirent));
//          VATTR_RETURN(vap, va_data_alloc,  xxx);
//          VATTR_RETURN(vap, va_iosize,      xxx);
 
//          VATTR_RETURN(vap, va_uid,   xxx);
//          VATTR_RETURN(vap, va_gid,   xxx);
            VATTR_RETURN(vap, va_mode,  S_IFDIR | S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
//          VATTR_RETURN(vap, va_flags, xxx);
//          VATTR_RETURN(vap, va_acl,   xxx);
 
            vap->va_create_time = volAttr.f_create_time;
            if (VFSATTR_IS_SUPPORTED(&volAttr, f_create_time)) {
                VATTR_SET_SUPPORTED(vap, va_create_time);
            }
            vap->va_modify_time = volAttr.f_modify_time;
            if (VFSATTR_IS_SUPPORTED(&volAttr, f_modify_time)) {
                VATTR_SET_SUPPORTED(vap, va_modify_time);
            }
            vap->va_access_time = volAttr.f_access_time;
            if (VFSATTR_IS_SUPPORTED(&volAttr, f_access_time)) {
                VATTR_SET_SUPPORTED(vap, va_access_time);
            }
            vap->va_change_time.tv_sec = 0;     // don't have to claim support for va_change_time because VFS sets it to va_modify_time
            vap->va_change_time.tv_nsec = 0;
            vap->va_backup_time = volAttr.f_backup_time;
            if (VFSATTR_IS_SUPPORTED(&volAttr, f_backup_time)) {
                VATTR_SET_SUPPORTED(vap, va_backup_time);
            }
 
            VATTR_RETURN(vap, va_fileid,   kMFSRootInodeNumber);
//          VATTR_RETURN(vap, va_linkid,   xxx);
            VATTR_RETURN(vap, va_parentid, kMFSRootParentInodeNumber);
//          VATTR_RETURN(vap, va_fsid,     mtmp->fBlockRDevNum);
//          VATTR_RETURN(vap, va_filerev,  xxx);
//          VATTR_RETURN(vap, va_gen,      xxx);
 
            VATTR_RETURN(vap, va_encoding, 0);                  // MacRoman
 
//          VATTR_RETURN(vap, va_type,  xxx);                   // handled by VFS
//          VATTR_RETURN(vap, va_name,  xxx);                   // let VFS get this from f_mntonname
//          VATTR_RETURN(vap, va_uuuid, xxx);
//          VATTR_RETURN(vap, va_guuid, xxx);
 
            VATTR_RETURN(vap, va_nchildren, volAttr.f_filecount);
        }
    } else {
        buf_t               buf;
        const void *        bufData;
 
        // For a file vnode, call MFS core to do the real work.  Of course, we have to make 
        // sure that the file's directory block is available to the core.
 
        buf = NULL;
        
        err = buf_meta_bread(fsmp->fBlockDevVNode, fsn->fDirBlock, fsmp->fBlockDevBlockSize, NULL, &buf);
        
        if (err == 0) {
            bufData = (const void *) buf_dataptr(buf);
            assert(bufData != NULL);
 
            err = MFSDirectoryEntryGetAttr(bufData, fsn->fDirOffset, vap);
        }
        
        if (buf != NULL) {
            buf_brelse(buf);
        }
        
        // If this is the resource fork, override the values for va_data_size and va_data_alloc 
        // returned by MFSDirectoryEntryGetAttr (which are the data fork values) with the values 
        // for the resource fork.  This seems pretty logical, and it's what HFS does, but it goes 
        // against the comments in <sys/vnode.h> that say that va_data_size and va_data_alloc are 
        // for the data fork.  However, if you don't do it this way, stuff like:
        //
        //      forkLength = lseek(resourceForkFD, 0, SEEK_END);
        //
        // returns the data fork length, not the resource fork length.  Ouch!
        //
        // This contradiction is <rdar://problem/4642760>.
 
        if (err == 0) {
            if ( HNodeGetForkIndexForVNode(vp) == 1 ) {
                VATTR_RETURN(vap, va_data_size,   fsn->fForkInfo[1].lengthInBytes);
                VATTR_RETURN(vap, va_data_alloc,  fsn->fForkInfo[1].physicalLengthInBytes);
            }
        }
    }
    
    return err;
}
 
static errno_t VNOPPathconf(struct vnop_pathconf_args *ap)
    // Called by VFS to get configuration information about a vnode.
    //
    // a_vp is the vnode whose information is requested.
    //
    // a_name is the pathconf value being requested.
    //
    // a_retval is a place to store the resulting value.  It is only written 
    // when we return 0.
    //
    // a_context identifies the calling process.
    //
    // Returns 0 for the selectors we answer, and EINVAL for everything else.
{
    int                 err;
    vnode_t             vn;
    int                 selector;
    register_t *        resultPtr;
    vfs_context_t       context;
    register_t          value;
 
    // Unpack arguments
 
    vn        = ap->a_vp;
    selector  = ap->a_name;
    resultPtr = ap->a_retval;
    context   = ap->a_context;
 
    // Pre-conditions
    
    assert( ValidVNode(vn) );
    assert(resultPtr != NULL);
    assert(context != NULL);
 
    // Implementation.  Work out the answer into a local first, and only 
    // store it if the selector turned out to be one we support.
    
    err   = 0;
    value = 0;
    switch (selector) {
        case _PC_LINK_MAX:
            value = 1;                  // no hard link support
            break;
        case _PC_NAME_MAX:
            value = __DARWIN_MAXNAMLEN;
            break;
        case _PC_PATH_MAX:
            value = MAXPATHLEN;
            break;
        case _PC_PIPE_BUF:
            value = PIPE_BUF;
            break;
        case _PC_CHOWN_RESTRICTED:
            value = 1;                  // it would be if we supported it (-:
            break;
        case _PC_NO_TRUNC:
            // We would error (not truncate) if you tried to create a file too long 
            // (if we were read/write :-)
            //
            // NOTE(review): that description reads like "no truncation is in effect", 
            // for which a non-zero value might be expected; confirm that 0 is intended.
            
            value = 0;
            break;
        case _PC_NAME_CHARS_MAX:
            value = 255;                // *** what's this about?
            break;
        case _PC_CASE_SENSITIVE:
            // NOTE(review): confirm this against the MFS core's name comparison.
            
            value = 1;
            break;
        case _PC_CASE_PRESERVING:
            value = 1;
            break;
 
        // The following are implemented by VFS, so it would be weird if they 
        // got through to us.  Complain in debug builds, then reject them.
        
        case _PC_EXTENDED_SECURITY_NP:
        case _PC_AUTH_OPAQUE_NP:
            assert(FALSE);
            err = EINVAL;
            break;
        
        // The following are terminal device stuff that we don't support; they 
        // get the same treatment as any selector we don't recognise.
        
        case _PC_MAX_CANON:
        case _PC_MAX_INPUT:
        case _PC_VDISABLE:
        default:
            err = EINVAL;
            break;
    }
    
    if (err == 0) {
        *resultPtr = value;
    }
    
    return err;
}
 
 
static errno_t CopyOutDirEnt(uio_t uio, ino_t ino, uint8_t type, struct dirent *dirEntBuf)
    // Copy a directory entry (struct dirent) out to user space, complying with 
    // all of the requirements of <x-man-page://5/dirent>.
    //
    // uio describes the buffer to where we copy the data.  There's no guarantee 
    // that this is big enough to hold the entire directory entry.  If it isn't, 
    // we return ENOBUFS and copy nothing.
    //
    // ino and type are placed in the fixed fields of the (struct dirent).
    //
    // dirEntBuf is a pointer to a (struct dirent) whose d_name field has been filled out. 
    // The name in this buffer could be much longer than the upper limit for 
    // a (struct dirent) (which is __DARWIN_MAXNAMLEN), and this routine will 
    // silently truncate it if necessary.  This means that the string in the 
    // buffer is /not/ preserved across the call.
    //
    // Returns 0 on success, ENOBUFS if the entry doesn't fit, or the error 
    // from uiomove.
{
    int     err;
    size_t  nameLen;
    
    assert(uio != NULL);
    assert(dirEntBuf != NULL);
    assert(dirEntBuf->d_name[0] != 0);          // the caller must have filled out the name already
 
    // The MFS core code can return names with a size up to MAXPATHLEN 
    // (1024) bytes.  This is fine, in general, but it causes problems 
    // for this code because a dirent name is not allowed to be longer 
    // than __DARWIN_MAXNAMLEN + 1 (256) bytes.  Unfortunately this isn't 
    // just a theoretic problem because MFS names can be up to 255 
    // MacRoman characters, and with a maximum MacRoman-to-UTF-8 expansion 
    // of 3x, we can easily blow the __DARWIN_MAXNAMLEN limit.
    //
    // So, we have to truncate the name.  To make this work 100% correctly, 
    // I should probably do some sort of name mangling, ala the File Manager 
    // FSSpec code, which inserts the file number of the file into the name 
    // so that it can find it even with a truncated name.  However, that's 
    // way too much work.  So, I've decided to just truncate the name in the 
    // most direct way possible.
    //
    // Still, I don't want to return illegal UTF-8, so I only truncate at 
    // a byte that can start a UTF-8 character.  If you look at the UTF-8 
    // specification (RFC 3629, which replaced RFC 2279), you'll see that a 
    // byte is a continuation byte, and hence /not/ a valid start, exactly 
    // when its top two bits are 10, that is, when (byte & 0xC0) == 0x80.  
    // ASCII bytes (0xxxxxxx) and lead bytes (11xxxxxx) are both valid places 
    // to cut.  This code goes backwards through the string looking for the 
    // first start byte that gives us a length of __DARWIN_MAXNAMLEN or less.
    //
    // Performance wise, I should probably start this search close to 
    // __DARWIN_MAXNAMLEN, rather than come all the way back from nameLen 
    // (which could potentially be much greater than __DARWIN_MAXNAMLEN), 
    // but I don't anticipate this happening enough to warrant the effort.
    //
    // Also, this code could potentially break the string at a non-MacRoman 
    // boundary.  For example, if you have "o umlaut" in the string, this 
    // will come back decomposed as "o" and "composing diaresis", and this 
    // could break those up.  I just don't care.  It's not like you're going 
    // to be able to VNOPLookup a truncated name anyway.
    
    nameLen = strlen(dirEntBuf->d_name);
    while (nameLen > __DARWIN_MAXNAMLEN) {
        // Loop invariant is that name[nameLen] is a valid UTF-8 start byte 
        // (with the edge case that 0 is considered a valid start byte, which is 
        // a case that crops up on the initial iteration).
        
        assert((dirEntBuf->d_name[nameLen] & 0xC0) != 0x80);
 
        // Step back one byte (safe, because nameLen > __DARWIN_MAXNAMLEN here) 
        // and then keep stepping back over continuation bytes until we reach the 
        // byte that starts the character.  The nameLen > 0 test stops us running 
        // off the front of the buffer if the name is malformed (all continuation 
        // bytes), even in builds where asserts are compiled out.
        
        nameLen -= 1;
        while ( (nameLen > 0) && ((dirEntBuf->d_name[nameLen] & 0xC0) == 0x80) ) {
            nameLen -= 1;
        }
    }
    dirEntBuf->d_name[nameLen] = 0;
    
    // Make sure that any pad bytes we copy out are zero.  There is guaranteed 
    // to be space for this because dirEntBuf->d_name has a size of MAXPATHLEN 
    // (1024) but nameLen must necessarily be __DARWIN_MAXNAMLEN (255) or less.
    
    dirEntBuf->d_name[nameLen + 1] = 0;
    dirEntBuf->d_name[nameLen + 2] = 0;
    dirEntBuf->d_name[nameLen + 3] = 0;
 
    // Set up the fixed fields of the dirent.  Note that <x-man-page://5/dirent> 
    // requires that d_reclen be evenly divisible by 4.
    
    dirEntBuf->d_fileno = ino;
    dirEntBuf->d_reclen = (offsetof(struct dirent, d_name) + nameLen + 1 + 3) & ~3;     // +1 to include null, +3 & ~3 to round up to the next 4 byte boundary
    dirEntBuf->d_type   = type;
    dirEntBuf->d_namlen = nameLen;
    
    // Copy out the dirent, if we have space for all of it.
 
    if ( dirEntBuf->d_reclen > uio_resid(uio) ) {
        err = ENOBUFS;
    } else {
        err = uiomove( (caddr_t) dirEntBuf, dirEntBuf->d_reclen, uio );
    }
    
    return err;
}
 
static errno_t ReadDirectoryAndCopyOutDirEnt(
    FSMount *       fsmp, 
    uint16_t *      dirBlockPtr, 
    size_t *        dirOffsetPtr, 
    boolean_t *     trustDirOffsetPtr,
    int *           numdirentPtr, 
    uio_t           uio, 
    struct dirent * dirEntBuf
)
    // Walks the on-disk directory of the volume described by fsmp, copying each 
    // directory entry out to the caller's buffer (uio) by way of CopyOutDirEnt.  
    // dirEntBuf is scratch space supplied by the caller; it must be a 
    // (struct dirent) buffer large enough to hold a MAXPATHLEN size name.
    //
    // *numdirentPtr is bumped by one for every entry that was copied out.
    //
    // *dirBlockPtr and *dirOffsetPtr say where to start reading on entry, and on 
    // exit describe where the next call should resume.  A directory offset of 
    // kMFSDirectoryBlockIterateFromStart (accepted on input and produced on output) 
    // means "begin with the first entry of the block", mirroring the convention of 
    // MFSDirectoryBlockIterate.
    //
    // If *trustDirOffsetPtr is false on entry, the incoming offset is validated 
    // with MFSDirectoryBlockCheckDirOffset before it is used.  A failed check is 
    // returned to the caller (EINVAL, per the original contract of this routine); 
    // a successful check sets *trustDirOffsetPtr to true, so the offset is not 
    // rechecked and so the caller knows it is safe to record in the dirOffset 
    // cache (the fLastDirOffset of the FSNode).
    //
    // Returns 0 once the whole directory has been walked, ENOBUFS if the user 
    // buffer filled up first, or whatever error stopped the walk.
{
    int                 err;
    uint16_t            endBlock;
    uint16_t            curBlock;
    size_t              curOffset;
    struct vnode_attr   entryAttr;

    // Pre-conditions:

    assert( ValidFSMount(fsmp) );
    assert(dirBlockPtr       != NULL);
    assert(dirOffsetPtr      != NULL);
    assert(trustDirOffsetPtr != NULL);
    assert(numdirentPtr      != NULL);
    assert(uio != NULL);
    assert(dirEntBuf != NULL);

    // Work on local copies of the cursor; they're written back at the very end.

    endBlock  = fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount;
    curBlock  = *dirBlockPtr;
    curOffset = *dirOffsetPtr;

    // One pass of this loop per directory block.  curBlock may already equal 
    // endBlock on entry (we previously ran off the end of the directory), in 
    // which case the loop body never runs and no data is returned.

    for (err = 0; (err == 0) && (curBlock < endBlock); ) {
        buf_t           blockBuf  = NULL;
        const void *    blockData = NULL;

        // Read the directory block through the buffer cache.

        err = buf_meta_bread(fsmp->fBlockDevVNode, curBlock, fsmp->fBlockDevBlockSize, NULL, &blockBuf);
        if (err == 0) {
            blockData = (const void *) buf_dataptr(blockBuf);
            assert(blockData != NULL);

            // An untrusted offset must be validated against the block contents 
            // before we start iterating from it.

            if ( ! *trustDirOffsetPtr ) {
                assert(curOffset != kMFSDirectoryBlockIterateFromStart);    // the start token is always trusted

                err = MFSDirectoryBlockCheckDirOffset(blockData, fsmp->fBlockDevBlockSize, curOffset);
                if (err == 0) {
                    *trustDirOffsetPtr = TRUE;
                }
            }
        }

        // Copy out entries from this block, starting at curOffset (which need 
        // not be the start of the block), until something stops us.

        while (err == 0) {
            size_t      offsetBeforeEntry;

            // Ask the MFS core for just the file number and the name, and have 
            // it write the name straight into our dirent scratch buffer.

            VATTR_INIT(&entryAttr);
            VATTR_WANTED(&entryAttr, va_fileid);
            VATTR_WANTED(&entryAttr, va_name);
            entryAttr.va_name = dirEntBuf->d_name;

            offsetBeforeEntry = curOffset;
            err = MFSDirectoryBlockIterate(blockData, fsmp->fBlockDevBlockSize, &curOffset, &entryAttr);

            if (err == 0) {
                err = CopyOutDirEnt(uio, entryAttr.va_fileid, DT_REG, dirEntBuf);
            }

            if (err == 0) {
                *numdirentPtr += 1;
            } else if (err == ENOBUFS) {

                // The entry didn't fit in the user's buffer.  Rewind the offset 
                // so that the next getdirentries call returns this entry.  We 
                // can't preflight the space check because the size of the dirent 
                // depends on the name, and we only learn the name by calling 
                // MFSDirectoryBlockIterate, which has already advanced curOffset.

                curOffset = offsetBeforeEntry;
            }
        }

        if (blockBuf != NULL) {
            buf_brelse(blockBuf);
        }

        // ENOENT means this block has been exhausted; that's not a failure, 
        // it just means we advance to the start of the next block.

        if (err == ENOENT) {
            curBlock  += 1;
            curOffset = kMFSDirectoryBlockIterateFromStart;
            err = 0;
        }
    }

    // Hand the updated cursor back to the caller.

    *dirOffsetPtr = curOffset;
    *dirBlockPtr  = curBlock;

    return err;
}
 
// When packing a dirOffset into the UIO offset (see PackUIOOffset and UnpackUIOOffset), 
// we substitute kStartOfBlockMagicOffset for the MFS core value of 
// kMFSDirectoryBlockIterateFromStart, because the MFS core value is too big to fit 
// into the 16-bit field we have available.  The substitute must itself fit in 16 bits 
// and must not be a valid offset within a directory block; UnpackUIOOffset asserts both.
 
enum {
    kStartOfBlockMagicOffset = 0x7362           // 'sb'
};
 
static errno_t UnpackUIOOffset(
    vnode_t     vn,
    uio_t       uio,
    uint16_t *  dirBlockPtr, 
    size_t *    dirOffsetPtr, 
    boolean_t * trustDirOffsetPtr
)
    // This routine extracts the offset from uio and unpacks it into a directory 
    // block number (*dirBlockPtr, taken from bits 16..31), a directory offset 
    // (*dirOffsetPtr, taken from bits 0..15), and a value indicating whether the 
    // caller can trust the directory offset (*trustDirOffsetPtr).  The packed 
    // value kStartOfBlockMagicOffset is translated to the MFS core token 
    // kMFSDirectoryBlockIterateFromStart.
    //
    // It returns EINVAL if the UIO offset is obviously wrong, and 0 otherwise.  
    // A result of 0 with *trustDirOffsetPtr false means that the offset is 
    // plausible but must still be validated against the directory block 
    // before it is used.  All three outputs are written even on failure.
    //
    // It handles a world of special cases and validity tests.  Yetch.
{
    int         err;
    FSMount *   fsmp;
    FSNode *    fsn;
    off_t       uioOffset;
    uint16_t    dirBlock;
    uint16_t    dirBlockLimit;
    size_t      dirOffset;
    boolean_t   trustDirOffset;
    boolean_t   useCache;
    
    // Pre-conditions
    
    assert( ValidVNode(vn) );
    assert(dirBlockPtr != NULL);
    assert(dirOffsetPtr != NULL);
    assert(trustDirOffsetPtr != NULL);
    
    fsmp = FSMountFromMount(vnode_mount(vn));
    fsn = FSNodeFromVNode(vn);
    
    // Some basic checks of the algorithm
    
    assert(fsmp->fDirectoryStartBlock > 0);             // this algorithm just won't work if 0 is a valid directory block
    assert( (((uint32_t) fsmp->fDirectoryStartBlock) + fsmp->fDirectoryBlockCount) < 65536);
                                                        // or the directory extends past 16 bits of blocks 
                                                        // (it's the end block, start + count, that must fit)
    assert(fsmp->fBlockDevBlockSize < 65536);           // or the block size (and hence dirOffset) exceeds 16 bits
    assert(kStartOfBlockMagicOffset < 65536);           // or the magic 'start of block' value exceeds 16 bits
    assert(kStartOfBlockMagicOffset >= fsmp->fBlockDevBlockSize);    // or the magic 'start of block' value is actually a valid dirOffset
 
    // In the debug build, the client can turn off directory offset caching.  
    // The cache was masking an interesting bug in the unpack code, so I 
    // want to turn it off to test the fix for that bug.
    
    useCache = TRUE;
    #if MACH_ASSERT
        useCache = ! vnode_isnocache(vn);
    #endif
    
    // Unpack dirBlock and dirOffset from the uio_offset.  The assignment to 
    // dirBlock truncates to 16 bits; that's harmless because the first test 
    // below rejects any uioOffset that doesn't fit in 32 bits.
    
    uioOffset = uio_offset(uio);
    dirBlock =  (uioOffset >> 16);
    dirOffset = (uioOffset & 0x0000FFFF);
    trustDirOffset = FALSE;
    
    dirBlockLimit = fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount;
    
    if ( (uioOffset < 0) || (uioOffset > 0x00000000FFFFFFFFLL) ) {
        // This is Just Wrong (tm).  This check makes the cast to uint32_t in the second 
        // special case acceptable.
        err = EINVAL;
    } else if ( (dirBlock == 0) && (dirOffset <= 1) ) {
        // First special case.  dirBlock 0 is assumed to hold the synthetic directory entries 
        // "." (at offset 0) and ".." (at offset 1).  We do this case before the next one so 
        // that the uioOffset == 0 case comes through here.  You'd get the same result in 
        // both cases, but it's just nicer if it comes through the right branch.
        err = 0;
        trustDirOffset = TRUE;
    } else if ( useCache && ((uint32_t) uioOffset == fsn->fLastDirOffset) ) {
        // Second special case: if the overall offset matches the last offset we returned 
        // to the user, everything has got to be OK because directory offsets can't change 
        // on a read-only file system.
        err = 0;
        if (dirOffset == kStartOfBlockMagicOffset) {
            dirOffset = kMFSDirectoryBlockIterateFromStart;
        }
        trustDirOffset = TRUE;
    } else if ( (dirBlock < fsmp->fDirectoryStartBlock) || (dirBlock > dirBlockLimit) ) {
        // dirBlock clearly out of range
        err = EINVAL;
    } else if (dirOffset == kStartOfBlockMagicOffset) {
        // Third special case: starting at the beginning of a block is always OK (if 
        // dirBlock is OK, which we've just checked).  We have to do this before the 
        // following checks, because a) we want to check for the start of the block 
        // before we reject dirBlock == dirBlockLimit, and b) kStartOfBlockMagicOffset is 
        // greater than the block size.
        err = 0;
        dirOffset = kMFSDirectoryBlockIterateFromStart;
        trustDirOffset = TRUE;
    } else if (dirBlock == dirBlockLimit) {
        // Fourth special case: we allow dirBlock == dirBlockLimit iff dirOffset == 
        // kStartOfBlockMagicOffset.  This is what you get pinned to once you've 
        // iterated the entire directory's contents.  No other dirOffset values 
        // are valid if we're off the end of the directory.
        err = EINVAL;
    } else if (dirOffset >= fsmp->fBlockDevBlockSize) {
        // dirOffset is too big.
        err = EINVAL;
    } else {
        // Everything might be OK, but we can't trust dirOffset.
        err = 0;
        assert( ! trustDirOffset );
    }
    
    *dirBlockPtr       = dirBlock;
    *dirOffsetPtr      = dirOffset;
    *trustDirOffsetPtr = trustDirOffset;
    
    return err;
}
 
static void PackUIOOffset(
    vnode_t     vn,
    uio_t       uio,
    uint16_t    dirBlock, 
    size_t      dirOffset,
    boolean_t   trustDirOffset
)
    // The inverse of UnpackUIOOffset: combines the directory block number 
    // (dirBlock, placed in bits 16 and up) and the directory offset (dirOffset, 
    // placed in the low 16 bits) into a single value and stores it as the 
    // UIO's offset.  The MFS core token kMFSDirectoryBlockIterateFromStart is 
    // encoded as kStartOfBlockMagicOffset.
    //
    // If trustDirOffset is true, the packed value is also remembered in the 
    // directory offset cache of the FSNode associated with vn (that is, the 
    // fLastDirOffset field).  Untrusted values are deliberately not cached.  
    // [There are /very/ obscure circumstances under which we can get an 
    // untrusted value in, /not/ look at it (and therefore not check its 
    // validity and establish trust), and then write it back out.  In such a 
    // situation, we don't want to promote the untrusted offset to trusted by 
    // putting it in cache.]
{
    FSMount *   fsmp;
    FSNode *    fsn;
    off_t       packedOffset;

    assert(vn != NULL);
    assert(uio != NULL);

    fsmp = FSMountFromMount(vnode_mount(vn));
    fsn  = FSNodeFromVNode(vn);

    // Sanity check the incoming pair.

    if (dirBlock != 0) {
        uint16_t    endBlock;

        endBlock = fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount;

        // The block must lie within the directory.  The upper bound is inclusive 
        // because dirBlock == endBlock is how "reached the end" is represented.

        assert( (dirBlock >= fsmp->fDirectoryStartBlock) && (dirBlock <= endBlock) );

        // Once we're at the end, the only legal offset is the "start of block" token.

        assert( (dirBlock < endBlock) || (dirOffset == kMFSDirectoryBlockIterateFromStart) );

        // Otherwise the offset is either that token or something inside the block.

        assert( (dirOffset == kMFSDirectoryBlockIterateFromStart) || (dirOffset < fsmp->fBlockDevBlockSize) );
    } else {

        // Block 0 holds only the synthetic entries at offsets 0 and 1.

        assert(dirOffset <= 1);
    }

    // Build the packed value: block number in the high part, offset (or its 
    // 16-bit stand-in) in the low part.

    if (dirOffset != kMFSDirectoryBlockIterateFromStart) {
        assert(dirOffset < 65536);      // UnpackUIOOffset asserts the block size bound too; this is a belt-and-braces check
        packedOffset = (((off_t) dirBlock) << 16) | dirOffset;
    } else {
        packedOffset = (((off_t) dirBlock) << 16) | kStartOfBlockMagicOffset;
    }

    // Anything in the cache is implicitly trusted on the way back in, so only 
    // trusted offsets may go into it.

    if (trustDirOffset) {
        fsn->fLastDirOffset = (uint32_t) packedOffset;
    }

    // Finally, publish the new offset to the UIO.

    uio_setoffset(uio, packedOffset);
}
 
static errno_t VNOPReadDir(struct vnop_readdir_args *ap)
    // Called by VFS to iterate the contents of a directory (most notably 
    // by the implementation of <x-man-page://2/getdirentries>).
    //
    // vp is the directory we're iterating.  
    //
    // uio describes the buffer into which we copy the (struct dirent) values 
    // that represent directory entries; it is discussed in detail below.
    //
    // flags contains two options bits, VNODE_READDIR_EXTENDED and 
    // VNODE_READDIR_REQSEEKOFF, neither of which we support (they're only 
    // needed if the file system is to be NFS exported).  If either is set 
    // we return EINVAL.
    //
    // eofflagPtr, if not NULL, is a place to indicate that we've read the 
    // last directory entry. 
    //
    // numdirentPtr, if not NULL, is a place to return a count of the 
    // number of directory entries that we've returned.
    //
    // context identifies the calling process.
    // 
    // The hardest thing to understand about this entry point is the UIO 
    // management.  There are two tricky aspects:
    //
    // o The UIO offset (accessed via uio_offset and uio_setoffset) 
    //   determines the first directory item read.  This does not have 
    //   to literally be an offset into the directory (such a usage makes 
    //   sense on a UFS-style file system, but it makes no sense for a 
    //   file system, like HFS Plus, which has no obvious directory offset). 
    //   Rather, the semantics are as follows:
    //
    //   - A UIO offset of zero indicates that you should read from the 
    //     start of the directory.
    //
    //   - You are responsible for setting the UIO offset to indicate how 
    //     much you read.
    //
    //   - This offset value can then be passed back to you to continue 
    //     reading at that offset.
    //
    //   So, if you have a file system where you can index directory items, 
    //   it's perfectly reasonable for you to use an index as the UIO offset.
    //   However, there are some gotchas:
    //
    //   - The UIO offset is an off_t, so you might think that you have 64 bits 
    //     to play with.  However, this is truncated down to a long in the 
    //     basep parameter of getdirentries, so you only have 32 bits (because 
    //     a long is 32 bits for 32-bit client processes).
    //
    //   - Furthermore, you only /actually/ have 31 bits, because longs are 
    //     signed, and if you return a negative offset then, if the client 
    //     tries to lseek <x-man-page://2/lseek> to that offset (which is a 
    //     legal usage pattern), lseek will fail (because it arbitrarily 
    //     disallows negative offsets, even for directories).
    //
    //   - Remember that uiomove increments the UIO offset by the number of bytes 
    //     that it copies.  Typically this is not useful behaviour for directories.  
    //     In most cases you will want to explicitly set the UIO offset 
    //     (using uio_setoffset) before you return.
    //
    //   - Because the offset can be set by untrusted programs (using lseek), 
    //     you must be able to safely (that is, without kernel panicking!) 
    //     reject illegal offsets.  If the client calls getdirentries after seeking 
    //     to a bogus offset, you should return EINVAL.
    //
    //   - Depending on your volume format, it may be expensive to verify that 
    //     the offset is valid.  In that case, you may want to cache the last 
    //     offset that you returned in your FSNode.  There are two things to be careful
    //     about here:
    //
    //     - Make sure you invalidate the cache if you do something that changes whether 
    //       an offset is valid.
    //
    //     - Be aware that you may need more than one cache entry, because multiple 
    //       clients may be reading the directory simultaneously.  Remember, while 
    //       each client gets their own file descriptor, there's only one FSNode 
    //       for any given on-disk directory.
    //
    // o The UIO resid (the residual, accessed by uio_resid and uio_setresid) 
    //   indicates how much space is left in the user buffer described by the UIO.  
    //   You must update this as you copy data out into that buffer (fortunately, 
    //   the obvious copying routine, uiomove does this update for you).  The VFS 
    //   layer uses this value to calculate the return value for the 
    //   getdirentries system call.  That is, the return value of 
    //   getdirentries is the original buffer size minus this UIO resid. 
    //   So, if you completely fill the user's buffer (hence resid is 
    //   0), getdirentries will return the original buffer size.  
    //   On the other hand, if you return no data, resid will be equal 
    //   to the buffer size, and getdirentries will return 0 (an indication 
    //   that there are no more items in the directory).
    //
    //   It's also worth noting that there is no guarantee that the 
    //   user's buffer size will be an even multiple of your dirent 
    //   size (in fact, there's no requirement for you to have a 
    //   fixed dirent size).  Thus, even after you've filled the user's 
    //   buffer (you've copied out all of the entries that will fit), 
    //   it's possible for resid to be positive.  Under no circumstances 
    //   should you copy out a partial dirent.
    //
    // o uiomove does not error if it only copies out a part of the data 
    //   that you requested.  You should call uio_resid to ensure that 
    //   there's enough space for the entire dirent before calling uiomove.
    //
    // Make sure you read <x-man-page://5/dirent> for information about 
    // (struct dirent).  Specifically, this page defines constraints on 
    // (struct dirent) to which you must comply.
    // 
    // On success, *eofflagPtr is TRUE if we've returned the last 
    // entry in this directory.  The NFS server uses this information 
    // to tag the reply packet that contains this entry with an EOF 
    // marker; this avoids the need for the client to make another 
    // call to confirm that it has read the entire directory.
    //
    // On success, *numdirentPtr is the number of dirent structures 
    // that we read.
    //
    // Returns 0 on success (including the case where the user's buffer 
    // filled up before we reached the end of the directory), ENOMEM if 
    // the temporary dirent buffer can't be allocated, EINVAL for an 
    // unsupported flag or a bogus UIO offset, or any error that comes 
    // back from reading the directory.
{
    errno_t         err;
    vnode_t         vp;
    struct uio *    uio;
    int             flags;
    int *           eofflagPtr;
    int             eofflag;
    int *           numdirentPtr;
    int             numdirent;
    vfs_context_t   context;
    FSMount *       fsmp;
 
    // Unpack arguments
 
    vp           = ap->a_vp;
    uio          = ap->a_uio;
    flags        = ap->a_flags;
    eofflagPtr   = ap->a_eofflag;
    numdirentPtr = ap->a_numdirent;
    context      = ap->a_context;
 
    // Pre-conditions
    
    assert( ValidVNode(vp) );
    assert(vnode_isdir(vp));            // VFS already checks that vp is a directory
    assert(vnode_isvroot(vp));          // and the only directory we have is the root
    assert(uio != NULL);
    AssertKnownFlags(flags, VNODE_READDIR_EXTENDED | VNODE_READDIR_REQSEEKOFF);
    // assert(eofflag != NULL);     // it's fine for this to be NULL
    // assert(numdirent == NULL);   // this is NULL in the typical case
    assert(context != NULL);
    
    // Implementation
    
    fsmp = FSMountFromMount(vnode_mount(vp));
 
    eofflag = FALSE;
    numdirent = 0;
    
    if ( (flags & VNODE_READDIR_EXTENDED) || (flags & VNODE_READDIR_REQSEEKOFF) ) {
        // We only need to support these flags if we want to support being exported 
        // by NFS.
        err = EINVAL;
    } else {
        uint16_t        dirBlock;
        size_t          dirOffset;
        boolean_t       trustDirOffset;
        struct dirent * dirEntBuf;
        
        // Allocate a dirent buffer that's big enough to hold the maximum possible 
        // name that can be returned by the MFS core.  This makes it possible for 
        // us to use a single buffer for calling MFS core and assembling the 
        // dirent that we're going to copy out to user space.
        
        err = 0;
        dirEntBuf = OSMalloc(offsetof(struct dirent, d_name) + MAXPATHLEN, gOSMallocTag);
        if (dirEntBuf == NULL) {
            err = ENOMEM;
        }
        
        // Unpack uio_offset and check it as best we can.
        
        if (err == 0) {
            err = UnpackUIOOffset(vp, uio, &dirBlock, &dirOffset, &trustDirOffset);
        }
 
        if (err == 0) {
            // We only end up here if UnpackUIOOffset succeeded, which means that we have reasonable 
            // values to pack back into the UIO's offset when we call PackUIOOffset below.  This is 
            // important because we can fail for a variety of reasons from this point in, so we 
            // can't just not update the UIO's offset if we get an error.
            
            // Handle the "." and ".." synthetic items.  These live in the pseudo 
            // directory block 0, with "." at offset 0 and ".." at offset 1.
            
            if ( (err == 0) && (dirBlock == 0) ) {
                // Because of the way unpacking works, trustDirOffset will be true when 
                // we get here.  That's good because it means that we don't need to set it 
                // as we modify dirOffset.
                
                assert(trustDirOffset);
                
                if (dirOffset == 0) {
                    strcpy(dirEntBuf->d_name, ".");
                    err = CopyOutDirEnt(uio, kMFSRootInodeNumber, DT_DIR, dirEntBuf);
                    if (err == 0) {
                        dirOffset = 1;
                        numdirent += 1;
                    }
                }
                if ( (err == 0) && (dirOffset == 1) ) {
                    // The second synthetic entry is the parent directory, "..", not 
                    // a second copy of ".".
                    //
                    // NOTE(review): this reports the root's own inode number for "..".  
                    // Confirm whether a distinct parent inode number should be used here.
                    
                    strcpy(dirEntBuf->d_name, "..");
                    err = CopyOutDirEnt(uio, kMFSRootInodeNumber, DT_DIR, dirEntBuf);
                    if (err == 0) {
                        // Both synthetic entries are done; move on to the first 
                        // real directory block.
                        dirBlock = fsmp->fDirectoryStartBlock;
                        dirOffset = kMFSDirectoryBlockIterateFromStart;
                        numdirent += 1;
                    }
                }
            }
 
            // Handle the actual MFS directory.
            
            if (err == 0) {
                err = ReadDirectoryAndCopyOutDirEnt(fsmp, &dirBlock, &dirOffset, &trustDirOffset, &numdirent, uio, dirEntBuf);
            }
            
            // We failed because there wasn't enough space in uio.  This is something that the 
            // caller should cope with, so we just swallow the error.
            
            if (err == ENOBUFS) {
                err = 0;
            }
            
            // Update uio_offset.  We do this even if we got an error, so that the 
            // offset reflects whatever entries we did manage to copy out.
 
            PackUIOOffset(vp, uio, dirBlock, dirOffset, trustDirOffset);
            
            // Determine if we're at the end of the directory.
            
            eofflag = (dirBlock == (fsmp->fDirectoryStartBlock + fsmp->fDirectoryBlockCount));
        }
 
        if (dirEntBuf != NULL) {
            OSFree(dirEntBuf, offsetof(struct dirent, d_name) + MAXPATHLEN, gOSMallocTag);
        }
    }
 
    // Copy out any information that's requested by the caller.
    
    if (eofflagPtr != NULL) {
        *eofflagPtr = eofflag;
    }
    if (numdirentPtr != NULL) {
        *numdirentPtr = numdirent;
    }
 
    return err;
}
 
static errno_t VNOPReclaim(struct vnop_reclaim_args *ap)
    // VFS calls this to break the association between a vnode and the 
    // FSNode that backs it.
    // 
    // vp is the vnode being reclaimed.
    //
    // context identifies the calling process.
    //
    // Keep this cheap.  In particular, this is /not/ where you'd write an 
    // FSNode back to disk; that belongs in your VNOPInactive entry point.
    //
    // IMPORTANT:
    // If VNOPReclaim fails, the system panics.  We therefore always return 0.
    //
    // The hash layer does all of the hard work; we only have to scrub the 
    // FSNode when the hash layer tells us to.
{
    vnode_t         vp      = ap->a_vp;
    vfs_context_t   context = ap->a_context;
    HNodeRef        hn;

    // Pre-conditions

    assert(vp != NULL);
    assert( ValidVNode(vp) );
    assert(context != NULL);

    // Detach the vnode from its HNode.  If the detach call reports true, 
    // scrub our FSNode and then let the hash layer know the scrub is complete.

    hn = HNodeFromVNode(vp);
    if ( ! HNodeDetachVNode(hn, vp) ) {
        return 0;
    }

    FSNodeScrub( FSNodeFromHNode(hn) );
    HNodeScrubDone(hn);

    return 0;
}
 
static errno_t VNOPOpen(struct vnop_open_args *ap)
    // VFS calls this when a vnode is opened for access.
    //
    // vp is the vnode being opened. 
    // 
    // mode holds the flags that were passed to open (FREAD and friends).
    //
    // context identifies the calling process.
    //
    // This hook is of limited value: VFS can read a file vnode without ever 
    // opening it, so anything you'd naturally do here has to be done lazily 
    // in the read/write entry points anyway.
    //
    // We have nothing to do, for either the root directory or a file vnode, 
    // so we just validate the arguments and report success.
{
    vnode_t         vp      = ap->a_vp;
    int             mode    = ap->a_mode;
    vfs_context_t   context = ap->a_context;

    // Pre-conditions (the locals above exist only to feed these checks)

    assert( ValidVNode(vp) );
    AssertKnownFlags(mode, O_EVTONLY | O_NONBLOCK | FREAD | FWRITE);
    assert(context != NULL);

    // Nothing else to do.

    return 0;
}
 
static errno_t VNOPClose(struct vnop_close_args *ap)
    // VFS calls this when a vnode is closed.
    //
    // vp is the vnode being closed. 
    //
    // fflag holds the flags associated with the close (FREAD and friends).
    //
    // context identifies the calling process.
    //
    // This hook is less useful than it looks, because a vnode can still be 
    // accessed after its last close (for example, if it has been memory 
    // mapped).  Work that seems to belong here usually ends up in VNOPInactive.
    //
    // We have nothing to do, so we just validate the arguments and report success.
{
    vnode_t         vp      = ap->a_vp;
    int             fflag   = ap->a_fflag;
    vfs_context_t   context = ap->a_context;

    // Pre-conditions (the locals above exist only to feed these checks)

    assert( ValidVNode(vp) );
    AssertKnownFlags(fflag, O_EVTONLY | O_NONBLOCK | FREAD | FWRITE);
    assert(context != NULL);

    // Nothing else to do.

    return 0;
}
 
static errno_t VNOPMmap(struct vnop_mmap_args *ap)
    // VFS calls this when it memory maps a file.
    //
    // vp is the vnode being memory mapped. 
    //
    // fflags holds the flags associated with the mmap (PROT_EXEC and friends).
    //
    // context identifies the calling process.
    //
    // No special action is needed on our part; we validate the arguments 
    // and report success.
    //
    // IMPORTANT
    // Returning an error here does not, in general, stop the file from being 
    // mapped.  If you want to prevent the mapping you must return EPERM; any 
    // other error is ignored.  [Even then, I don't think it'll actually cause 
    // the mmap system call to fail.  Hmmmm.]
{
    vnode_t         vp      = ap->a_vp;
    int             fflags  = ap->a_fflags;
    vfs_context_t   context = ap->a_context;

    // Pre-conditions (the locals above exist only to feed these checks)

    assert( ValidVNode(vp) );
    assert( vnode_isreg(vp) || vnode_ischr(vp) );           // VFS won't try to mmap anything else
    AssertKnownFlags(fflags, PROT_EXEC | PROT_READ | PROT_WRITE);
    assert(context != NULL);

    // Nothing else to do.

    return 0;
}
 
static errno_t VNOPMnomap(struct vnop_mnomap_args *ap)
    // VFS calls this when it unmaps a file.
    //
    // vp is the vnode being unmapped. 
    //
    // context identifies the calling process.
    //
    // No special action is needed on our part; we validate the arguments 
    // and report success.
    //
    // IMPORTANT
    // VFS ignores the result of this function.  You can't prevent an unmap.
{
    vnode_t         vp      = ap->a_vp;
    vfs_context_t   context = ap->a_context;

    // Pre-conditions (the locals above exist only to feed these checks)

    assert( ValidVNode(vp) );
    assert( vnode_isreg(vp) || vnode_ischr(vp) );           // nothing else should be mmapped
    assert(context != NULL);

    // Nothing else to do.

    return 0;
}
 
static errno_t VNOPRead(struct vnop_read_args *ap)
    // VFS calls this to read from a file.
    //
    // vp is the vnode being read.
    //
    // uio gives the file offset to read from and the destination buffer.
    //
    // ioflag holds the flags associated with the read (IO_NDELAY and friends).
    //
    // context identifies the calling process.
    //
    // For a regular file the request is handed to the cluster layer, using 
    // the byte length of the fork that vp represents as the file size, and 
    // the cluster layer's result is returned.  A directory yields EISDIR and 
    // any other vnode type yields EPERM.
{
    vnode_t         vp      = ap->a_vp;
    uio_t           uio     = ap->a_uio;
    int             ioflag  = ap->a_ioflag;
    vfs_context_t   context = ap->a_context;
    FSNode *        fsn;
    size_t          forkIndex;

    // Pre-conditions

    assert( ValidVNode(vp) );
    assert(uio != NULL);
    AssertKnownFlags(ioflag, IO_NDELAY | IO_SYNC | IO_APPEND | IO_UNIT | IO_NODELOCKED);
    assert(context != NULL);

    // Work out which fork of which FSNode this vnode refers to.

    fsn = FSNodeFromVNode(vp);
    forkIndex = HNodeGetForkIndexForVNode(vp);
    assert(forkIndex <= 1);

    // Regular files: let the cluster layer do all of the heavy lifting.

    if ( vnode_isreg(vp) ) {
        return cluster_read(vp, uio, fsn->fForkInfo[forkIndex].lengthInBytes, 0);
    }

    // Everything else can't be read this way.

    if ( vnode_isdir(vp) ) {
        return EISDIR;
    }
    return EPERM;
}
 
static errno_t VNOPPagein(struct vnop_pagein_args *ap)
    // VFS entry point that services a virtual memory pagein.
    //
    // ap->a_vp is the vnode being paged in.
    //
    // ap->a_pl is the universal page list describing the pages to fill.
    //
    // ap->a_pl_offset is the offset within that page list.
    //
    // ap->a_f_offset is the offset within the file.
    //
    // ap->a_size is passed straight through to the cluster layer.
    //
    // ap->a_flags holds the pagein flags (UPL_IOSYNC and so on).
    //
    // ap->a_context identifies the calling process.
    //
    // The result is whatever cluster_pagein returns.
{
    vnode_t         vp;
    upl_t           pl;
    vm_offset_t     pl_offset;
    off_t           f_offset;
    size_t          size;
    int             flags;
    vfs_context_t   context;
    FSNode *        node;
    size_t          fork;

    // Pull the arguments out of the parameter block.

    vp        = ap->a_vp;
    pl        = ap->a_pl;
    pl_offset = ap->a_pl_offset;
    f_offset  = ap->a_f_offset;
    size      = ap->a_size;
    flags     = ap->a_flags;
    context   = ap->a_context;

    // Sanity check them.  Only regular files and character devices should 
    // ever be mmapped, and flags like UPL_MSYNC should only show up on pageout.

    assert( ValidVNode(vp) );
    assert( vnode_isreg(vp) || vnode_ischr(vp) );
    assert(pl != NULL);
    AssertKnownFlags(flags, UPL_IOSYNC | UPL_NOCOMMIT | UPL_NORDAHEAD);
    assert(context != NULL);

    // Work out which fork of which file system node we're dealing with.

    node = FSNodeFromVNode(vp);
    fork = HNodeGetForkIndexForVNode(vp);
    assert(fork <= 1);

    // The cluster layer does all of the real work, bounded by the fork's 
    // logical length.

    return cluster_pagein(vp, pl, pl_offset, f_offset, size, node->fForkInfo[fork].lengthInBytes, flags);
}
 
static errno_t VNOPBlktooff(struct vnop_blktooff_args *ap)
    // Cluster layer callback that converts a logical block number within 
    // a file into the byte offset at which that block starts.
    //
    // ap->a_vp is the vnode being queried.
    //
    // ap->a_lblkno is the logical block number.
    //
    // ap->a_offset points to where the resulting byte offset is stored.
    //
    // There is no context parameter for this call.
    //
    // Always returns 0.
{
    vnode_t     vp;
    daddr64_t   blockNumber;
    off_t *     resultPtr;
    FSMount *   mount;

    // Pull the arguments out of the parameter block.

    vp          = ap->a_vp;
    blockNumber = ap->a_lblkno;
    resultPtr   = ap->a_offset;

    // Sanity check them.

    assert( ValidVNode(vp) );
    assert(vnode_isreg(vp));
    assert(resultPtr != NULL);

    // Every logical block is one allocation block, and the volume has a 
    // single fixed allocation block size, so this is a simple multiply.

    mount = FSMountFromMount(vnode_mount(vp));

    *resultPtr = blockNumber * mount->fAllocationBlockSizeInBytes;

    return 0;
}
 
static errno_t VNOPOfftoblk(struct vnop_offtoblk_args *ap)
    // Cluster layer callback that converts a byte offset within a file 
    // into the number of the logical block that contains it.
    //
    // ap->a_vp is the vnode being queried.
    //
    // ap->a_offset is the file offset, in bytes.
    //
    // ap->a_lblkno points to where the resulting block number is stored.
    //
    // There is no context parameter for this call.
    //
    // Always returns 0.
{
    vnode_t     vp;
    off_t       byteOffset;
    daddr64_t * resultPtr;
    FSMount *   mount;

    // Pull the arguments out of the parameter block.

    vp         = ap->a_vp;
    byteOffset = ap->a_offset;
    resultPtr  = ap->a_lblkno;

    // Sanity check them.

    assert( ValidVNode(vp) );
    assert(vnode_isreg(vp));
    assert(resultPtr != NULL);

    // Every logical block is one allocation block, and the volume has a 
    // single fixed allocation block size, so this is a simple divide.

    mount = FSMountFromMount(vnode_mount(vp));

    *resultPtr = byteOffset / mount->fAllocationBlockSizeInBytes;

    return 0;
}
 
/*
    VNOPBlockmap Pre- and Post-Conditions
    -------------------------------------
    On entry, ap->a_foffset is an even multiple of the device block size
 
    On entry, ap->a_foffset is not necessarily an even multiple of the fork's logical block size (that is, 
    the value that I multiply and divide by in the VNOPBlktooff/VNOPOfftoblk)
 
    On entry, ap->a_foffset is not necessarily an even multiple of the page size (although it is on 10.4.x)
 
    On entry, ap->a_foffset is greater than or equal to zero
 
    On entry, ap->a_foffset is strictly less than the logical fork length
 
    On entry, ap->a_foffset is strictly less than the physical fork length
 
    On entry, ap->a_size is an even multiple of the device block size
 
    On entry, ap->a_size is not necessarily an even multiple of the fork's logical block size
 
    On entry, ap->a_size is not necessarily an even multiple of the page size
 
    On entry, ap->a_size is non-zero
 
    On entry, ap->a_foffset + ap->a_size is less than or equal to the physical fork length (not its 
    logical length) rounded up to the next page
 
    On error, *ap->a_bpn is ignored
 
    On success, *ap->a_bpn must be -1 if ap->a_foffset falls into a hole in a sparse fork
 
    On success, if ap->a_foffset does not fall into a hole in a sparse file, *ap->a_bpn must be the 
    device block number that contains the byte at ap->a_foffset in the fork
 
    On error, *ap->a_run is ignored
 
    On success, *ap->a_run must not be 0
 
    On success, *ap->a_run must be less than or equal to ap->a_size
 
    On success, *ap->a_run must be an even multiple of the device block size
 
    On success, *ap->a_run is not necessarily an even multiple of the fork's logical block size
 
    On success, *ap->a_run is bounded by the fork's physical length (not its logical size)
*/
 
static errno_t VNOPBlockmap(struct vnop_blockmap_args *ap)
    // Called by the cluster layer to map a file offset to its corresponding 
    // disk block number, and to return the number of physically contiguous 
    // bytes of the file that start at the block.
    //
    // vp is the vnode whose mapping is being queried.
    //
    // foffset is the offset within the file, in bytes, that's of interest.
    //
    // size is the size, in bytes, that is being queried.  That is, the caller 
    // is interested in knowing the mapping of the bytes that start at 
    // foffset and end at foffset + size - 1.
    //
    // bpnPtr is a pointer to a block number.  On success, we set this 
    // to the device block that contains the byte at foffset within the file.
    //
    // runPtr is a pointer to byte count.  On success, we set this to 
    // the number of physically contiguous bytes of the file that start at 
    // foffset within the file, bounded by size.
    //
    // poffPtr is always NULL and is not meaningful; if it's not NULL, we 
    // set *poffPtr to 0 on success.
    //
    // flags contains the flags associated with the blockmap (things like VNODE_READ).
    //
    // context identifies the calling process.  Because this is called out of the 
    // cluster layer, it is typically NULL.
    //
    // Returns 0 on success, or the error from MFSForkGetExtent; on error none 
    // of the output parameters are modified.
{
    int             err;
    vnode_t         vp;
    off_t           foffset;
    size_t          size;
    daddr64_t *     bpnPtr;
    size_t *        runPtr;
    int *           poffPtr;
    int             flags;
    vfs_context_t   context;
    FSMount *       fsmp;
    FSNode *        fsn;
    size_t          forkIndex;
    off_t           offsetWithinAllocationBlock;
    uint32_t        offsetFromFirstAllocationBlockInBytes;
    uint32_t        contiguousPhysicalBytes;

    // Unpack arguments

    vp      = ap->a_vp;
    foffset = ap->a_foffset;
    size    = ap->a_size;
    bpnPtr  = ap->a_bpn;
    runPtr  = ap->a_run;
    poffPtr = (int *) ap->a_poff;
    flags   = ap->a_flags;
    context = ap->a_context;

    // Pre-conditions (first round)

    assert(ValidVNode(vp));
    assert(foffset >= 0);
    assert(size > 0);
    assert(bpnPtr != NULL);
    assert(runPtr != NULL);
    // poff may be NULL; in fact, it's always NULL right now
    AssertKnownFlags(flags, VNODE_READ | VNODE_WRITE);
    // context is typically NULL

    // Extract file system specific data from the VFS data.

    fsmp = FSMountFromMount(vnode_mount(vp));
    fsn  = FSNodeFromVNode(vp);
    forkIndex = HNodeGetForkIndexForVNode(vp);
    assert(forkIndex <= 1);
    
    // Pre-conditions (second round) -- These are dependent on the file system specific data, 
    // so we check them after we've extracted that from the VFS data.
    
    assert( (foffset % fsmp->fBlockDevBlockSize) == 0 );
    assert(foffset < fsn->fForkInfo[forkIndex].lengthInBytes);
    assert(foffset < fsn->fForkInfo[forkIndex].physicalLengthInBytes);

    assert( (size % fsmp->fBlockDevBlockSize) == 0 );

    assert((foffset + size) <= ((fsn->fForkInfo[forkIndex].physicalLengthInBytes + (PAGE_SIZE - 1)) / PAGE_SIZE * PAGE_SIZE));

    // Implementation -- The bulk of the work is done by the MFS core.  The tricky part is 
    // adapting the VFS pre-conditions to the MFSCore pre-conditions.  Specifically, VFS 
    // specifies that foffset is an even multiple of the device block size, but the equivalent 
    // value in MFSCore must be an even multiple of the allocation block size.  So we ask 
    // MFSCore about the start of the allocation block that contains foffset and then 
    // adjust both results by the distance from there to foffset.
    
    offsetWithinAllocationBlock = foffset % fsmp->fAllocationBlockSizeInBytes;
    err = MFSForkGetExtent(
        fsmp->fMDBVABM, 
        &fsn->fForkInfo[forkIndex], 
        foffset - offsetWithinAllocationBlock, 
        &offsetFromFirstAllocationBlockInBytes, 
        &contiguousPhysicalBytes
    );
    if (err == 0) {

        // The extent we got back starts at the beginning of the allocation block, but 
        // foffset can fall at a device block boundary within that allocation block.  
        // The block number we return must be for the byte at foffset, so we have to 
        // add in the offset within the allocation block before converting to device blocks.
        
        assert( (offsetWithinAllocationBlock % fsmp->fBlockDevBlockSize) == 0 );
        *bpnPtr = fsmp->fAllocationBlocksStartBlock 
                + ((offsetFromFirstAllocationBlockInBytes + offsetWithinAllocationBlock) / fsmp->fBlockDevBlockSize);
        
        // Reduce the physically contiguous bytes by the same amount, so that the run 
        // is measured from foffset rather than from the start of the allocation block.
        
        assert(contiguousPhysicalBytes >= offsetWithinAllocationBlock);
        contiguousPhysicalBytes -= offsetWithinAllocationBlock;
        
        // Now reduce it again to bound it by size, which is a requirement of the post-condition.
        
        if (contiguousPhysicalBytes > size) {
            contiguousPhysicalBytes = size;
        }

        *runPtr = contiguousPhysicalBytes;

        if (poffPtr != NULL) {
            *poffPtr = 0;
        }
    }
    
    // Post-conditions -- These only apply in the success case.  Rather than add a 
    // error check into each assert, I do the asserts within a "if" statement.
    
    if (err == 0) {
        assert(*bpnPtr != -1);          // we don't support sparse files
        assert(*runPtr != 0);
        assert(*runPtr <= size);
        assert( (*runPtr % fsmp->fBlockDevBlockSize) == 0 );
        assert((foffset + *runPtr) <= fsn->fForkInfo[forkIndex].physicalLengthInBytes);
    }
    
    return err;
}
 
static errno_t VNOPStrategy(struct vnop_strategy_args *ap)
    // Cluster layer (and bio layer) callback that starts a buffer I/O.  
    // As is typical, we simply forward the request to buf_strategy, 
    // targeting the block device vnode recorded in our mount.
    //
    // ap->a_bp is the buffer describing the I/O.
    //
    // The result is whatever buf_strategy returns.
{
    buf_t       bp;
    vnode_t     vn;
    FSMount *   mount;

    // Pull the buffer out of the parameter block and sanity check it.

    bp = ap->a_bp;
    assert(bp != NULL);

    // Get from the buffer to its vnode, and from there to our mount.

    vn = buf_vnode(bp);
    assert(vn != NULL);
    mount = FSMountFromMount(vnode_mount(vn));

    // Hand the I/O off.

    return buf_strategy(mount->fBlockDevVNode, ap);
}
 
static errno_t CopyOutExtendedAttributeName(uio_t uio, const char *xattrName, size_t *sizePtr)
    // Helper for VNOPListxattr that emits one extended attribute name.
    //
    // If uio is NULL the caller only wants to know how much space is needed, 
    // so we add the size of the name (including its null terminator) to 
    // *sizePtr and return 0.
    //
    // If uio is not NULL we copy the name (including its null terminator) 
    // to it, failing with ERANGE if there isn't enough room left in uio.  
    // *sizePtr is not touched in this case.
{
    size_t  bytesToCopy;

    // uio may be NULL
    assert(xattrName != NULL);
    assert(sizePtr != NULL);

    bytesToCopy = strlen(xattrName) + 1;            // the + 1 is for the null terminator

    // Size-only query.

    if (uio == NULL) {
        *sizePtr += bytesToCopy;
        return 0;
    }

    // Not enough room left in the caller's buffer.

    if (uio_resid(uio) < bytesToCopy) {
        return ERANGE;
    }

    return uiomove( (caddr_t) xattrName, bytesToCopy, uio );
}
 
// IMPORTANT:
// As far as MFS is concerned, Finder info is 16 bytes long.  The extra 16 bytes 
// of extended Finder info were added with HFS.
//
// kEmptyFinderInfo is 16 zero bytes.  VNOPListxattr and VNOPGetxattr memcmp a 
// file's Finder info against it to decide whether the Finder info extended 
// attribute should be reported at all.
 
static const uint8_t kEmptyFinderInfo[16] = { 0 };
 
static errno_t VNOPListxattr(struct vnop_listxattr_args *ap)
    // VFS entry point that lists the names of a vnode's extended attributes.
    // 
    // ap->a_vp is the vnode whose extended attributes are being listed.
    //
    // ap->a_uio is the buffer that receives the names, one after another, 
    // each terminated by a null character.  It may be NULL, in which case the 
    // caller only wants to know how big the buffer needs to be.
    //
    // ap->a_size points to the size result.  We always reset it to zero, and 
    // in the size-only case (uio is NULL) it accumulates the space required.
    //
    // ap->a_options holds the flags for the operation (XATTR_NOSECURITY and 
    // so on).  VFS interprets all of the existing flags, so we only sanity 
    // check them.
    //
    // ap->a_context identifies the calling process.
    //
    // Resource fork vnodes fail with EPERM.  Vnodes that aren't regular files 
    // succeed with an empty list.  For regular files we list the Finder info 
    // attribute if the Finder info isn't all zeroes, and the resource fork 
    // attribute if the resource fork isn't empty.
{
    int             err;
    vnode_t         vp;
    uio_t           uio;
    size_t *        sizePtr;
    int             options;
    vfs_context_t   context;
    FSMount *       mount;
    FSNode *        node;
    uint8_t         finderInfo[16];

    // Pull the arguments out of the parameter block.

    vp      = ap->a_vp;
    uio     = ap->a_uio;
    sizePtr = ap->a_size;
    options = ap->a_options;
    context = ap->a_context;

    // Sanity check them.  uio is allowed to be NULL.

    assert(ValidVNode(vp));
    assert(sizePtr != NULL);
    AssertKnownFlags(options, XATTR_NOFOLLOW | XATTR_NOSECURITY);
    assert(context != NULL);

    mount = FSMountFromMount(vnode_mount(vp));
    node  = FSNodeFromVNode(vp);

    *sizePtr = 0;

    // Extended attributes aren't available via the resource fork vnode.

    if ( HNodeGetForkIndexForVNode(vp) != 0 ) {
        return EPERM;
    }

    // In MFS only files can have extended attributes, so for anything else 
    // the list is empty.

    if ( ! vnode_isreg(vp) ) {
        return 0;
    }

    // List the Finder info, but only if there's something in it.

    err = FSNodeGetFinderInfo(mount, node, finderInfo);
    if (err != 0) {
        return err;
    }
    if (memcmp(finderInfo, kEmptyFinderInfo, sizeof(kEmptyFinderInfo)) != 0) {
        err = CopyOutExtendedAttributeName(uio, XATTR_FINDERINFO_NAME, sizePtr);
        if (err != 0) {
            return err;
        }
    }

    // List the resource fork, but only if there's something in it.

    if (node->fForkInfo[1].lengthInBytes != 0) {
        err = CopyOutExtendedAttributeName(uio, XATTR_RESOURCEFORK_NAME, sizePtr);
    }

    return err;
}
 
static errno_t CopyOutExtendedAttribute(uio_t uio, void *xattr, size_t xattrSize, size_t *sizePtr)
    // Helper for VNOPGetxattr that emits the value of an extended attribute, 
    // described by xattr and xattrSize.
    //
    // If uio is NULL the caller only wants to know how big the attribute 
    // is, so we store xattrSize in *sizePtr and return 0.
    //
    // If uio is not NULL we copy the attribute to it, failing with ERANGE 
    // if there isn't enough room left in uio.  *sizePtr is not touched in 
    // this case.
{
    // uio may be NULL
    assert(xattr != NULL);
    assert(xattrSize != 0);
    assert(sizePtr != NULL);

    // Size-only query.

    if (uio == NULL) {
        *sizePtr = xattrSize;
        return 0;
    }

    // Not enough room left in the caller's buffer.

    if (uio_resid(uio) < xattrSize) {
        return ERANGE;
    }

    return uiomove( (caddr_t) xattr, xattrSize, uio );
}
 
static errno_t VNOPGetxattr(struct vnop_getxattr_args *ap)
    // VFS entry point that fetches the value of one extended attribute.
    // 
    // ap->a_vp is the vnode whose extended attribute is being fetched.
    //
    // ap->a_name is the name of the extended attribute.
    //
    // ap->a_uio is the buffer that receives the attribute value.  It may be 
    // NULL, in which case the caller only wants to know the attribute's size, 
    // which we return in *ap->a_size.
    //
    // ap->a_size points to the size result; it's only written in the 
    // size-only (uio is NULL) case.
    //
    // ap->a_options holds the flags for the operation (XATTR_NOSECURITY and 
    // so on).  VFS interprets all of the existing flags, so we only sanity 
    // check them.
    //
    // ap->a_context identifies the calling process.
    //
    // Resource fork vnodes fail with EPERM.  Vnodes that aren't regular files, 
    // unknown attribute names, all-zero Finder info and empty resource forks 
    // all fail with ENOATTR.
{
    int             err;
    int             junk;
    vnode_t         vp;
    const char *    name;
    uio_t           uio;
    size_t *        sizePtr;
    int             options;
    vfs_context_t   context;
    FSMount *       mount;
    FSNode *        node;
    uint8_t         finderInfo[32];
    vnode_t         rsrcVN;

    // Pull the arguments out of the parameter block.

    vp      = ap->a_vp;
    name    = ap->a_name;
    uio     = ap->a_uio;
    sizePtr = ap->a_size;
    options = ap->a_options;
    context = ap->a_context;

    // Sanity check them.  uio is allowed to be NULL.

    assert(ValidVNode(vp));
    assert(name != NULL);
    assert(sizePtr != NULL);
    AssertKnownFlags(options, XATTR_NOFOLLOW | XATTR_NOSECURITY);
    assert(context != NULL);

    mount = FSMountFromMount(vnode_mount(vp));
    node  = FSNodeFromVNode(vp);

    // Extended attributes aren't available via the resource fork vnode.

    if ( HNodeGetForkIndexForVNode(vp) != 0 ) {
        return EPERM;
    }

    // Anything that isn't a regular file (for MFS, that's the root directory 
    // vnode) has no extended attributes.

    if ( ! vnode_isreg(vp) ) {
        return ENOATTR;
    }

    // Finder info.

    if (strcmp(name, XATTR_FINDERINFO_NAME) == 0) {
        err = FSNodeGetFinderInfo(mount, node, finderInfo);
        if (err != 0) {
            return err;
        }

        // All-zero Finder info is treated as if the attribute doesn't exist.

        if (memcmp(finderInfo, kEmptyFinderInfo, sizeof(kEmptyFinderInfo)) == 0) {
            return ENOATTR;
        }

        // We return 32 bytes, with the extended Finder info (the second 
        // 16 bytes) cleared.

        memset(&finderInfo[16], 0, 16);

        return CopyOutExtendedAttribute(uio, finderInfo, sizeof(finderInfo), sizePtr);
    }

    // The only other attribute we know about is the resource fork.

    if (strcmp(name, XATTR_RESOURCEFORK_NAME) != 0) {
        return ENOATTR;
    }

    // An empty resource fork is treated as if the attribute doesn't exist.

    if (node->fForkInfo[1].lengthInBytes == 0) {
        return ENOATTR;
    }

    // Size-only query.

    if (uio == NULL) {
        *sizePtr = node->fForkInfo[1].lengthInBytes;
        return 0;
    }

    // Get the vnode for fork index 1 of this file and read from it into 
    // the caller's buffer.

    rsrcVN = NULL;

    err = FSNodeGetOrCreateVNodeByID(mount, HNodeGetInodeNumber(HNodeFromVNode(vp)), 1, &rsrcVN);
    if (err == 0) {
        err = VNOP_READ(rsrcVN, uio, 0, context);
    }

    // Drop the vnode if we got one, regardless of how the read went.

    if (rsrcVN != NULL) {
        junk = vnode_put(rsrcVN);
        assert(junk == 0);
    }

    return err;
}
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** VFS Operations
 
static errno_t FSMountParseArguments(FSMount *fsmp, user_addr_t data)
    // Part of VFSOPMount.  Copies our mount arguments in from the user space 
    // address data, validates them, and records them in fsmp.
    //
    // Returns the error from copyin if that fails, or EINVAL if the 
    // arguments don't carry the expected magic value.  fsmp is only modified 
    // on success.
{
    int                 err;
    MFSLivesMountArgs   args;

    assert( ValidFSMount(fsmp) );
    // We deliberately don't assert that data is not USER_ADDR_NULL; as long 
    // as the copyin works, we don't care what the address is.

    // Get the arguments into kernel space.

    err = copyin(data, &args, sizeof(MFSLivesMountArgs));
    if (err != 0) {
        return err;
    }

    // Reject anything that doesn't look like our argument structure.

    if ( args.fMagic != kMFSLivesMountArgsMagic ) {
        return EINVAL;
    }

    // Record the options, normalising each to a boolean.

    fsmp->fForceMount   = (args.fForceMount   != 0);
    fsmp->fForceFailure = (args.fForceFailure != 0);

    return 0;
}
 
static errno_t FSMountConnectDevNode(FSMount *fsmp, vnode_t devvp, vfs_context_t context)
    // Part of VFSOPMount.  Hooks fsmp up to devvp, the vnode of the block 
    // device that we're mounting on.  VFS has already opened that vnode for 
    // us, so all that's left is to issue a couple of ioctls to learn the 
    // device's geometry and to record the vnode in fsmp.
    //
    // Fails with EINVAL if the device block size isn't 512 and the force 
    // mount option wasn't given; otherwise returns any error from the 
    // ioctls or from vnode_ref.
    //
    // A non-NULL fsmp->fBlockDevVNode means that we successfully took a 
    // use count reference on the vnode, which must be released at clean up.
{
    int     err;

    assert( ValidFSMount(fsmp) );
    assert(devvp   != NULL);
    assert(context != NULL);

    // Get the device block size, and refuse untested sizes unless forced.

    err = VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t) &fsmp->fBlockDevBlockSize, 0, context);
    if (err != 0) {
        return err;
    }
    if ( (fsmp->fBlockDevBlockSize != 512) && ! fsmp->fForceMount ) {
        printf("MFSLives:FSMountConnectDevNode: Mounting on non-512 byte block devices has not been tested; specify the -f mount flag to mount anyway.\n");
        return EINVAL;
    }

    // Get the number of blocks on the device.

    err = VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t) &fsmp->fBlockDevBlockCount, 0, context);
    if (err != 0) {
        return err;
    }

    // Strictly speaking we don't need our own use count reference on the 
    // device vnode, because the system already holds one for us, but it's 
    // harmless and it keeps the paranoid happy.

    err = vnode_ref(devvp);
    if (err != 0) {
        return err;
    }

    // Only now that we hold the reference do we record the vnode.

    fsmp->fBlockDevVNode = devvp;
    fsmp->fBlockRDevNum  = vnode_specrdev(devvp);

    return 0;
}
 
static errno_t FSMountSetupMFSCore(FSMount *fsmp, vfs_context_t context)
    // Part of VFSOPMount.  Asks the MFS core whether the volume is acceptable 
    // and, if so, collects the key volume parameters into fsmp.  It then 
    // allocates fsmp->fMDBVABM and reads the combined MDB/VABM into it, for 
    // use by the rest of this code.
    //
    // Returns 0 on success, ENOMEM if an allocation fails, or the error 
    // from the failing read or MFS core call.  On failure fsmp->fMDBVABM may 
    // have been allocated; this routine does not free it.
{
    int     err;
    buf_t   buf;
    uio_t   uio;

    assert( ValidFSMount(fsmp) );
    assert(fsmp->fMDBVABM == NULL);
    assert(context != NULL);

    // Step 1: read the MDB block and have the MFS core check that the volume 
    // is remotely valid and tell us about it (including how big the MDB/VABM is).

    buf = NULL;

    err = buf_meta_bread(fsmp->fBlockDevVNode, kMFSMDBBlock, fsmp->fBlockDevBlockSize, NULL, &buf);
    if (err == 0) {
        const void *    mdb;

        mdb = (const void *) buf_dataptr(buf);
        assert(mdb != NULL);

        err = MFSMDBCheck(
            mdb, 
            fsmp->fBlockDevBlockCount,
            &fsmp->fMDBAndVABMSizeInBytes, 
            &fsmp->fDirectoryStartBlock,
            &fsmp->fDirectoryBlockCount,
            &fsmp->fAllocationBlocksStartBlock,
            &fsmp->fAllocationBlockSizeInBytes
        );
        if (err == 0) {
            assert( (fsmp->fAllocationBlockSizeInBytes % fsmp->fBlockDevBlockSize) == 0 );
        }
    }

    // The buffer is released whether or not the check succeeded.

    if (buf != NULL) {
        buf_brelse(buf);
    }
    if (err != 0) {
        return err;
    }

    // Step 2: allocate the runtime copy of the MDB/VABM.  The size is first 
    // rounded up to a whole number of device blocks, because we read it from 
    // fsmp->fBlockDevVNode, a raw block device, and those want block-aligned I/O.

    fsmp->fMDBAndVABMSizeInBytes = (fsmp->fMDBAndVABMSizeInBytes + fsmp->fBlockDevBlockSize - 1) / fsmp->fBlockDevBlockSize * fsmp->fBlockDevBlockSize;

    fsmp->fMDBVABM = OSMalloc(fsmp->fMDBAndVABMSizeInBytes, gOSMallocTag);
    if (fsmp->fMDBVABM == NULL) {
        return ENOMEM;
    }

    // Step 3: read the MDB/VABM from the device, starting at the MDB block, 
    // into that buffer.

    uio = uio_create(1, fsmp->fBlockDevBlockSize * kMFSMDBBlock, UIO_SYSSPACE, UIO_READ);
    if (uio == NULL) {
        return ENOMEM;
    }

    err = uio_addiov(uio, CAST_USER_ADDR_T(fsmp->fMDBVABM), fsmp->fMDBAndVABMSizeInBytes);
    if (err == 0) {
        err = VNOP_READ(fsmp->fBlockDevVNode, uio, 0, context);
    }

    uio_free(uio);

    return err;
}
 
static errno_t FSMountSetupVFS(FSMount *fsmp)
    // This routine connects the volume to VFS.  That is, it does all of the 
    // VFS specific stuff that has to be done before we're finished mounting:  
    // it fills in the statfs information for the mount and forces the 
    // mount flags that we know must apply to this file system.
    // It is called right at the end of VFSOPMount.
    //
    // On entry, the fMDBVABM, fBlockRDevNum and fMountPoint fields of fsmp 
    // must already be set up.
    //
    // Returns 0 on success, or the error from MFSMDBGetAttr, in which case 
    // neither the statfs information nor the mount flags are modified.
{
    int                 err;
    struct vfs_attr     attr;
    struct vfsstatfs *  sbp;
    fsid_t              fsid;

    assert( ValidFSMount(fsmp) );
    assert(fsmp->fMDBVABM != NULL);             // these fields of fsmp must be set up before we get here
    assert(fsmp->fBlockRDevNum != 0);
    assert(fsmp->fMountPoint != NULL);
    
    // Set up the statfs information.  You can get a pointer to the vfsstatfs 
    // that you need to fill out by calling vfs_statfs.  Before calling your 
    // mount entry point, VFS has already zeroed the entire structure and set 
    // up f_fstypename, f_mntonname, f_mntfromname (if VFC_VFSLOCALARGS was set; 
    // in the other case VFS doesn't know this information and you have to set it 
    // yourself), and f_owner.  You are responsible for filling out the other fields 
    // (except f_reserved1, f_type, and f_flags, which are reserved).  You can also 
    // override VFS's settings if need be.
    // 
    // IMPORTANT:
    // It is vital that you fill out all of these fields (especially the 
    // f_bsize, f_bfree, and f_bavail fields) before returning from VFSOPMount.
    // If you don't, higher-level system components (such as File Manager) can 
    // get very confused.  Specifically, File Manager can get and /cache/ these 
    // values before VFSOPGetattr is ever called.  So you can't rely on a call to 
    // VFSOPGetattr to set up these fields for the first time.

    // Call the MFS core to get the various attributes we need.
    
    VFSATTR_INIT(&attr);
    VFSATTR_WANTED(&attr, f_bsize);
    VFSATTR_WANTED(&attr, f_iosize);
    VFSATTR_WANTED(&attr, f_blocks);
    VFSATTR_WANTED(&attr, f_bfree);
    VFSATTR_WANTED(&attr, f_bavail);
    VFSATTR_WANTED(&attr, f_bused);
    VFSATTR_WANTED(&attr, f_files);
    VFSATTR_WANTED(&attr, f_ffree);

    err = MFSMDBGetAttr(fsmp->fMDBVABM, &attr);
    if (err == 0) {
        
        // Copy those attributes out to VFS's buffer.
        
        sbp = vfs_statfs(fsmp->fMountPoint);
        assert(sbp != NULL);
        assert( strcmp(sbp->f_fstypename, "MFSLives") == 0 );
        
        assert( VFSATTR_IS_SUPPORTED(&attr, f_bsize) );
        sbp->f_bsize  = attr.f_bsize;
        assert( VFSATTR_IS_SUPPORTED(&attr, f_iosize) );
        sbp->f_iosize = attr.f_iosize;
        assert( VFSATTR_IS_SUPPORTED(&attr, f_blocks) );
        sbp->f_blocks = attr.f_blocks;
        assert( VFSATTR_IS_SUPPORTED(&attr, f_bfree) );
        sbp->f_bfree  = attr.f_bfree;
        assert( VFSATTR_IS_SUPPORTED(&attr, f_bavail) );
        sbp->f_bavail = attr.f_bavail;
        assert( VFSATTR_IS_SUPPORTED(&attr, f_bused) );
        sbp->f_bused  = attr.f_bused;
        assert( VFSATTR_IS_SUPPORTED(&attr, f_files) );
        sbp->f_files  = attr.f_files;
        assert( VFSATTR_IS_SUPPORTED(&attr, f_ffree) );
        sbp->f_ffree  = attr.f_ffree;

        // MFS core doesn't return fsid, so we have to cook it up ourselves 
        // from the raw device number and our file system type number.  Note 
        // that we must store the value we cooked up, not attr.f_fsid:  we never 
        // asked the MFS core for f_fsid, so that field of attr is uninitialised.
        
        fsid.val[0] = fsmp->fBlockRDevNum;
        fsid.val[1] = vfs_typenum(fsmp->fMountPoint);
        sbp->f_fsid   = fsid;

        // The situation with sbp->f_owner is complex.  VFS sets this to be the EUID of 
        // the user who called <x-man-page://2/mount>.   Even if we wanted to override it 
        // at this point, the only thing that would fix is the value returned by statfs.  
        // Internally, VFS uses an owner field in the mount_t (mnt_fsowner) that you can't 
        // set using any KPI (the vfs_setowner routine is actually a macro!).  If you need 
        // to control this field, I suggest you do from your mount tool (by setting the EUID 
        // before calling mount).
        //
        // The only noticeable oddity is that the f_owner field returned by <x-man-page://2/statfs> 
        // is always 0.  This is because my disk is being mounted by DiskArb calling my mount tool
        // (mount_MFSLives), and DiskArb runs as root.  This behaviour is shared by various Apple 
        // VFS plug-ins (notably, "msdosfs") so I'm not going to worry about it.  Fortunately, 
        // because MNT_IGNORE_OWNERSHIP is set, this doesn't cause any real problems.  
        //
        // Finally, even if I wanted to change the owner, it's not easy to do because the 
        // mount_MFSLives tool is run with both EUID and RUID set to 0.  I'd have to do 
        // something scary in the mount tool to get the actual user who did the mount.
    }

    // Finally, enable the various mount flags.  Normally these are set up by 
    // the mount tool, but in this case we know enough about our capabilities 
    // to force certain flags to certain states.  The commented out entries 
    // are the flags that we deliberately leave alone.
    
    if (err == 0) {
        vfs_setflags(fsmp->fMountPoint, 0
            | MNT_RDONLY
//          | MNT_SYNCHRONOUS   
            | MNT_NOEXEC
            | MNT_NOSUID
            | MNT_NODEV
//          | MNT_UNION
//          | MNT_ASYNC
//          | MNT_DONTBROWSE    
            | MNT_IGNORE_OWNERSHIP
//          | MNT_AUTOMOUNTED 
//          | MNT_JOURNALED   
//          | MNT_NOUSERXATTR   
//          | MNT_DEFWRITE  
//          | MNT_EXPORTED  
//          | MNT_LOCAL
//          | MNT_QUOTA
//          | MNT_ROOTFS
            | MNT_DOVOLFS
        );
    }

    // I'd like to call vfs_setlocklocal here (to tell VFS that it can take care of advisory 
    // locking for us).  However, it's not exported by the BSD KPI <rdar://problem/4641321>.
    
    if (err == 0) {
        // vfs_setlocklocal(fsmp->fMountPoint);
    }

    // AFAICT you don't need to call vnode_setmountedon because the system does it for you.
        
    return err;
}
 
static errno_t VFSOPUnmount(mount_t mp, int mntflags, vfs_context_t context);
    // Forward declaration of a routine whose definition is not in this part of the file.
    // NOTE(review): presumably required because VFSOPMount, below, references 
    // VFSOPUnmount before it is defined -- confirm against the body of VFSOPMount.
 
static errno_t VFSOPMount(mount_t mp, vnode_t devvp, user_addr_t data, vfs_context_t context)
    // Called by VFS to mount an instance of our file system.
    //
    // mp is a reference to the kernel structure tracking this instance of the 
    // file system.
    //
    // devvp is either:
    //   o an open vnode for the block device on which we're mounted, or 
    //   o NULL
    // depending on the VFS_TBLLOCALVOL flag in the vfe_flags field of the vfs_fsentry 
    // that we registered.  In the former case, the first field of our file system specific 
    // mount arguments must be a pointer to a C string holding the UTF-8 path to the block 
    // device node.
    //
    // data is a pointer to our file system specific mount arguments in the address 
    // space of the current process (the one that called mount).  This is a parameter 
    // block passed to us by our mount tool telling us what to mount and how.  Because 
    // VFS_TBLLOCALVOL is set, the first field of this structure must be a pointer to the 
    // path of the block device node; the kernel interprets this parameter, opening up 
    // the node for us.
    //
    // IMPORTANT:
    // If VFS_TBLLOCALVOL is set, the first field of the file system specific mount 
    // parameters is interpreted by the kernel AND THE KERNEL INCREMENTS data TO POINT 
    // TO THE FIELD AFTER THE PATH.  We handle this by defining our mount parameter 
    // structure (MFSLivesMountArgs) in two ways: for user space code, the first field 
    // (fDevNodePath) is a pointer to the block device node path; for kernel code, we omit 
    // this field.
    //
    // IMPORTANT:
    // If your file system claims to be 64-bit ready (VFS_TBL64BITREADY is set), you must 
    // be prepared to handle mount requests from both 32- and 64-bit processes.  Thus, 
    // your file system specific mount parameters must be either 32/64-bit invariant 
    // (as is the case for this example), or you must interpret them differently depending 
    // on the type of process you're being called by (see proc_is64bit from <sys/proc.h>).
    //
    // context identifies the calling process.
    //
    // Returns 0 on success or an errno-style error code.  A request to update an 
    // existing mount is rejected with ENOTSUP and leaves that mount untouched.  If 
    // a fresh mount fails part way through, everything set up so far is torn down 
    // (via VFSOPUnmount) before we return.
{
    int             err;
    int             junk;
    FSMount *       fsmp;
    
    // Pre-conditions
 
    assert(mp != NULL);
    assert(devvp != NULL);
    assert(data != 0);
    assert(context != NULL);
    
    // Implementation
    
    // This example does not support updating a volume's state (for example, 
    // upgrading it from read-only to read/write).
    //
    // We must bail out right here rather than fall into the common error path 
    // at the end of this routine.  For an update request mp refers to a volume 
    // that is already mounted, so the common clean up (a forced VFSOPUnmount) 
    // would flush the vnodes and free the FSMount of a live mount, leaving the 
    // mount with a dangling file system private pointer.
 
    if ( vfs_isupdate(mp) ) {
        printf("MFSLives:VFSOPMount: mount failed with error %d\n", ENOTSUP);
        return ENOTSUP;
    }
 
    // Allocate the FSMount structure and connect it to the mount point.
    
    err = 0;
    fsmp = OSMalloc(sizeof(*fsmp), gOSMallocTag);
    if (fsmp == NULL) {
        err = ENOMEM;
    } else {
        memset(fsmp, 0, sizeof(*fsmp));
        fsmp->fMagic = kFSMountMagic;
        
        fsmp->fMountPoint = mp;
        vfs_setfsprivate(mp, fsmp);
    }
    
    // Parse the arguments from user space.
    
    if (err == 0) {
        err = FSMountParseArguments(fsmp, data);
    }
    
    // Connect to the underlying block device and set up the MFS core data structures 
    // from that device.
    
    if (err == 0) {
        err = FSMountConnectDevNode(fsmp, devvp, context);
    }
    if (err == 0) {
        err = FSMountSetupMFSCore(fsmp, context);
    }
    
    // Let the VFS layer know about the specifics of this volume.
    
    if (err == 0) {
        err = FSMountSetupVFS(fsmp);
    }
 
    if (err == 0) {
        if (fsmp->fForceFailure) {
 
            // By setting the above to true, you can force a mount failure, which 
            // allows you to test the error path.
            
            printf("MFSLives:VFSOPMount: mount succeeded, force failure\n");
            err = ENOTSUP;
        } else {
            printf("MFSLives:VFSOPMount: mount succeeded\n");
        }
    } else {
        printf("MFSLives:VFSOPMount: mount failed with error %d\n", err);
    }
    
    // If we return an error, our unmount VFSOP is never called.  Thus, we have 
    // to clean up ourselves.  This is only reached for a fresh mount (update 
    // requests returned above), so everything attached to mp was created by 
    // this call and is ours to destroy.
    
    if (err != 0) {
        junk = VFSOPUnmount(mp, MNT_FORCE, context);
        assert(junk == 0);
    }
    
    return err;
}
 
static errno_t VFSOPStart(mount_t mp, int flags, vfs_context_t context)
    // VFS calls this to confirm a mount.
    //
    // mp      -- the kernel structure tracking this instance of the file system.
    // flags   -- reserved; we assert that no bits are set.
    // context -- identifies the calling process.
    //
    // Always returns 0.
    //
    // There is nothing useful to do here: to avoid concurrency problems all 
    // initialisation should be complete before VFSOPMount returns.  The entry 
    // point is also optional, because the kernel glue (VFS_START) treats a 
    // NULL entry as ENOTSUP and its caller ignores that error.  It is 
    // implemented anyway, just in case.
{
    errno_t     result;

    // Sanity check the parameters; none of them is otherwise used.

    assert(mp != NULL);
    AssertKnownFlags(flags, 0);
    assert(context != NULL);

    result = 0;
    return result;
}
 
static errno_t VFSOPUnmount(mount_t mp, int mntflags, vfs_context_t context)
    // Called by VFS to unmount a volume.  Also called by our VFSOPMount code 
    // to clean up if something goes wrong.
    //
    // mp is a reference to the kernel structure tracking this instance of the 
    // file system.
    //
    // mntflags is a set of flags; currently only MNT_FORCE is defined.
    //
    // context identifies the calling process.
    //
    // Returns 0 on success, or the error from vflush.  If vflush fails, none 
    // of our file system specific state is released.
{
    int             err;
    FSMount *       fsmp;
    int             flushFlags;
    
    // Pre-conditions
    
    assert(mp != NULL);
    AssertKnownFlags(mntflags, MNT_FORCE);
    assert(context != NULL);
    
    // Implementation
    
    // A forced unmount maps on to FORCECLOSE for vflush.
    
    flushFlags = ((mntflags & MNT_FORCE) != 0) ? FORCECLOSE : 0;
    
    // Prior to calling us, VFS has flushed all regular vnodes (that is, it called 
    // vflush with SKIPSWAP, SKIPSYSTEM, and SKIPROOT set).  Now we have to flush 
    // all vnodes, including the root.  If flushFlags is FORCECLOSE, this is a 
    // forced unmount (which will succeed even if there are files open on the volume). 
    // In this case, if a vnode can't be flushed, vflush will disconnect it from the 
    // mount.
    
    err = vflush(mp, NULL, flushFlags);
 
    // Clean up the file system specific data attached to the mount.
    //
    // If VFSOPMount fails, it's possible for us to end up here without a 
    // valid file system specific mount record.  We skip the clean up if 
    // that happens.
    
    if ( (err == 0) && (vfs_fsprivate(mp) != NULL) ) {
        fsmp = FSMountFromMount(mp);
        
        if (fsmp->fBlockDevVNode != NULL) {         // release our reference, if any
            vnode_rele(fsmp->fBlockDevVNode);
            fsmp->fBlockDevVNode = NULL;
            fsmp->fBlockRDevNum = 0;
        }
        
        if (fsmp->fMDBVABM != NULL) {
            OSFree(fsmp->fMDBVABM, fsmp->fMDBAndVABMSizeInBytes, gOSMallocTag);
            fsmp->fMDBVABM = NULL;
        }
        
        // Poison the magic so that stale references to this record are caught 
        // by any code that checks it.
        
        fsmp->fMagic = kFSMountBadMagic;
        
        // Detach the record from the mount before freeing it.  We use a non-NULL 
        // private pointer (above) to decide whether there is anything to clean 
        // up, so leaving it pointing at freed memory would turn any later call 
        // for this mount into a use-after-free and double free.
        
        vfs_setfsprivate(mp, NULL);
        
        OSFree(fsmp, sizeof(*fsmp), gOSMallocTag);
    }
 
    return err;
}
 
static errno_t VFSOPRoot(mount_t mp, struct vnode **vpp, vfs_context_t context)
    // VFS calls this to obtain the root vnode of this instance of the file system.
    //
    // mp      -- the kernel structure tracking this instance of the file system.
    // vpp     -- where to store the result.  On success it receives the root 
    //            vnode, on which we must hold an I/O reference that the caller 
    //            is responsible for releasing.
    // context -- identifies the calling process.
    //
    // Returns whatever FSNodeGetOrCreateRootVNode returns; that routine does 
    // pretty much all of the work.
{
    errno_t         err;
    vnode_t         rootVN;
    
    // Pre-conditions
 
    assert(mp != NULL);
    assert(vpp != NULL);
    assert(context != NULL);
 
    // Implementation
    
    rootVN = NULL;
    err = FSNodeGetOrCreateRootVNode(FSMountFromMount(mp), &rootVN);
 
    // Store the result unconditionally.  Doing so makes the post-condition 
    // hold no matter what value VFS left in *vpp before calling us.
 
    *vpp = rootVN;
 
    // Post-conditions
    
    assert( (err != 0) || (*vpp != NULL) );
 
    return err;
}
 
static errno_t VFSOPGetattr(mount_t mp, struct vfs_attr *attr, vfs_context_t context)
    // VFS calls this to get information about this instance of the file system.
    //
    // mp      -- the kernel structure tracking this instance of the file system.
    // attr    -- describes the attributes requested and the place to store the 
    //            results.
    // context -- identifies the calling process.
    //
    // As with VNOPGetattr, two macros help here: VFSATTR_RETURN returns a value 
    // easily, and VFSATTR_IS_ACTIVE tells you whether a value is wanted at all.
    //
    // Returns the result of MFSMDBGetAttr.
{
    FSMount *   fsmp;
    fsid_t      volumeID;
 
    // Pre-conditions
    
    assert(mp != NULL);
    assert(attr != NULL);
    assert(context != NULL);
    
    // Implementation
    
    fsmp = FSMountFromMount(mp);
 
    // The MFS core doesn't supply f_fsid, so build one here from the block 
    // device number and our file system type number.
    
    volumeID.val[0] = fsmp->fBlockRDevNum;
    volumeID.val[1] = vfs_typenum(fsmp->fMountPoint);
    VFSATTR_RETURN(attr, f_fsid, volumeID);
    
    // Everything else comes from the MFS core.
    
    return MFSMDBGetAttr(fsmp->fMDBVABM, attr);
}
 
 
static errno_t VFSOPVget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context)
    // VFS calls this to get a vnode from its inode number.  This is a key 
    // component of the support for volfs.
    //
    // mp      -- the kernel structure tracking this instance of the file system.
    // ino     -- the inode number of the item being requested.
    // vpp     -- where to store the result.  On success it receives the vnode 
    //            for the fsobj whose inode is ino; for a file that should be 
    //            the vnode of the data fork.  We must hold an I/O reference on 
    //            the vnode, and the caller is responsible for releasing it.
    // context -- identifies the calling process.
    //
    // Returns ENOENT if ino doesn't fit in 32 bits, otherwise the result of 
    // FSNodeGetOrCreateVNodeByID.
{
    int         err;
    FSMount *   fsmp;
    vnode_t     resultVN;
    size_t      dataForkIndex;
    
    // Pre-conditions
 
    assert(mp != NULL);
    assert(vpp != NULL);
    assert(context != NULL);
 
    // Implementation
 
    fsmp = FSMountFromMount(mp);
 
    resultVN = NULL;
    
    if ( (ino >> 32) == 0 ) {
 
        // VFSOPVget always returns the vnode for the data fork, which is 
        // fork index 0.
 
        dataForkIndex = 0;
        err = FSNodeGetOrCreateVNodeByID(fsmp, ino, dataForkIndex, &resultVN);
    } else {
 
        // We only support 32-bit inode numbers, so anything with bits set 
        // in the upper half can't possibly be valid.
 
        err = ENOENT;
    }
    
    // Store the result unconditionally.  Doing so makes the post-condition 
    // hold no matter what value VFS left in *vpp before calling us.
 
    *vpp = resultVN;
    
    assert( (err == 0) == (*vpp != NULL) );
    
    return err;
}
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** Configuration Data
 
// VNodeOp is the generic function pointer type that every routine listed in 
// gVNodeOperationEntries is cast to, regardless of the routine's actual 
// parameter type.
typedef errno_t (*VNodeOp)(void *);
 
// gVNodeOperationEntries is an array that describes all of the vnode operations 
// supported by vnodes created by our VFS plug-in.  This is, in turn, wrapped up 
// by gVNodeOperationVectorDesc and gVNodeOperationVectorDescList, and it's this 
// last variable that's referenced by gVFSEntry.
 
// The following is a list of all of the vnode operations supported on 
// Mac OS X 10.4, with the ones that we support uncommented.
 
// Each entry pairs a vnode operation descriptor with the routine that 
// implements it, cast to the generic VNodeOp type.  Commented-out entries are 
// operations we do not implement; they are kept so the list stays a complete 
// reference.
static struct vnodeopv_entry_desc gVNodeOperationEntries[] = {
//  { &vnop_access_desc,        (VNodeOp) VNOPAccess      },
//  { &vnop_advlock_desc,       (VNodeOp) VNOPAdvlock     },
//  { &vnop_allocate_desc,      (VNodeOp) VNOPAllocate    },
    { &vnop_blktooff_desc,      (VNodeOp) VNOPBlktooff    },
    { &vnop_blockmap_desc,      (VNodeOp) VNOPBlockmap    },
//  { &vnop_bwrite_desc,        (VNodeOp) VNOPBwrite      },
    { &vnop_close_desc,         (VNodeOp) VNOPClose       },
//  { &vnop_copyfile_desc,      (VNodeOp) VNOPCopyfile    },
//  { &vnop_create_desc,        (VNodeOp) VNOPCreate      },
    { &vnop_default_desc,       (VNodeOp) vn_default_error},            // presumably the fallback for operations not listed here -- confirm against the VFS KPI
//  { &vnop_exchange_desc,      (VNodeOp) VNOPExchange    },
//  { &vnop_fsync_desc,         (VNodeOp) VNOPFsync       },
    { &vnop_getattr_desc,       (VNodeOp) VNOPGetattr     },
//  { &vnop_getattrlist_desc,   (VNodeOp) VNOPGetattrlist },            // not useful, implement getattr instead
    { &vnop_getxattr_desc,      (VNodeOp) VNOPGetxattr    },
//  { &vnop_inactive_desc,      (VNodeOp) VNOPInactive    },
//  { &vnop_ioctl_desc,         (VNodeOp) VNOPIoctl       },
//  { &vnop_link_desc,          (VNodeOp) VNOPLink        },
    { &vnop_listxattr_desc,     (VNodeOp) VNOPListxattr   },
    { &vnop_lookup_desc,        (VNodeOp) VNOPLookup      },
//  { &vnop_mkdir_desc,         (VNodeOp) VNOPMkdir       },
//  { &vnop_mknod_desc,         (VNodeOp) VNOPMknod       },
    { &vnop_mmap_desc,          (VNodeOp) VNOPMmap        },
    { &vnop_mnomap_desc,        (VNodeOp) VNOPMnomap      },
    { &vnop_offtoblk_desc,      (VNodeOp) VNOPOfftoblk    },
    { &vnop_open_desc,          (VNodeOp) VNOPOpen        },
    { &vnop_pagein_desc,        (VNodeOp) VNOPPagein      },
//  { &vnop_pageout_desc,       (VNodeOp) VNOPPageout     },
    { &vnop_pathconf_desc,      (VNodeOp) VNOPPathconf    },
    { &vnop_read_desc,          (VNodeOp) VNOPRead        },
    { &vnop_readdir_desc,       (VNodeOp) VNOPReadDir     },
//  { &vnop_readdirattr_desc,   (VNodeOp) VNOPReaddirattr },
//  { &vnop_readlink_desc,      (VNodeOp) VNOPReadlink    },
    { &vnop_reclaim_desc,       (VNodeOp) VNOPReclaim     },
//  { &vnop_remove_desc,        (VNodeOp) VNOPRemove      },
//  { &vnop_removexattr_desc,   (VNodeOp) VNOPRemovexattr },
//  { &vnop_rename_desc,        (VNodeOp) VNOPRename      },
//  { &vnop_revoke_desc,        (VNodeOp) VNOPRevoke      },
//  { &vnop_rmdir_desc,         (VNodeOp) VNOPRmdir       },
//  { &vnop_searchfs_desc,      (VNodeOp) VNOPSearchfs    },
//  { &vnop_select_desc,        (VNodeOp) VNOPSelect      },
//  { &vnop_setattr_desc,       (VNodeOp) VNOPSetattr     },
//  { &vnop_setattrlist_desc,   (VNodeOp) VNOPSetattrlist },            // not useful, implement setattr instead
//  { &vnop_setxattr_desc,      (VNodeOp) VNOPSetxattr    },
    { &vnop_strategy_desc,      (VNodeOp) VNOPStrategy    },
//  { &vnop_symlink_desc,       (VNodeOp) VNOPSymlink     },
//  { &vnop_whiteout_desc,      (VNodeOp) VNOPWhiteout    },
//  { &vnop_write_desc,         (VNodeOp) VNOPWrite       },
    { NULL, NULL }                                                      // end-of-list marker; no entry count accompanies this table
};
 
// gVNodeOperationVectorDesc points to our vnode operations array 
// (gVNodeOperationEntries) and to a place (gVNodeOperations) where the 
// system, on successful registration, stores a final vnode array that's 
// used to create our vnodes.
 
static struct vnodeopv_desc gVNodeOperationVectorDesc = {
    &gVNodeOperations,                          // opv_desc_vector_p -- where the system stores the final vnode operations array
    gVNodeOperationEntries                      // opv_desc_ops      -- our table of supported vnode operations
};
 
// gVNodeOperationVectorDescList is an array of vnodeopv_desc that allows us to 
// register multiple vnode operations arrays at the same time.  A full-featured 
// file system would use this to register different arrays for standard vnodes, 
// device vnodes (VBLK and VCHR), and FIFO vnodes (VFIFO).  In our case, we only 
// support standard vnodes, so our array only has one entry.
 
static struct vnodeopv_desc *gVNodeOperationVectorDescList[1] =
{
    &gVNodeOperationVectorDesc                  // the one and only entry: operations for standard vnodes
};
 
// gVFSOps is a structure that contains pointers to all of the VFSOP routines.  
// These are routines that operate on instances of the file system (rather than 
// on vnodes).
 
// NOTE: the initialiser below is positional, so the entries must stay in the 
// order in which struct vfsops declares its fields.
static struct vfsops gVFSOps = {
    VFSOPMount,                                 // vfs_mount
    VFSOPStart,                                 // vfs_start
    VFSOPUnmount,                               // vfs_unmount
    VFSOPRoot,                                  // vfs_root
    NULL,                                       // vfs_quotactl -- only needed if you support quotas
    VFSOPGetattr,                               // vfs_getattr
    NULL,                                       // vfs_sync     -- not needed for read-only file systems
    VFSOPVget,                                  // vfs_vget
    NULL,                                       // vfs_fhtovp   -- only needed if you do NFS export
    NULL,                                       // vfs_vptofh   -- ditto
    NULL,                                       // vfs_init     -- optional
    NULL,                                       // vfs_sysctl   -- MFSLives has no custom sysctls
    NULL,                                       // vfs_setattr  -- not needed for read-only file systems
    {NULL, NULL, NULL, NULL, NULL, NULL, NULL}  // vfs_reserved
};
 
// gVFSEntry describes the overall VFS plug-in.  It's passed as a parameter 
// to vfs_fsadd to register this file system.
 
static struct vfs_fsentry gVFSEntry = {
    &gVFSOps,                                               // vfe_vfsops
    sizeof(gVNodeOperationVectorDescList) / sizeof(*gVNodeOperationVectorDescList),
                                                            // vfe_vopcnt -- number of entries in the list below
    gVNodeOperationVectorDescList,                          // vfe_opvdescs
    0,                                                      // vfe_fstypenum, see VFS_TBLNOTYPENUM below
    "MFSLives",                                             // vfe_fsname
                                                            // vfe_flags
          VFS_TBLTHREADSAFE             // we do our own internal locking and thus don't need funnel protection
        | VFS_TBLFSNODELOCK             // ditto
        | VFS_TBLNOTYPENUM              // we don't have a pre-defined file system type (the VT_XXX constants 
                                        // in <sys/vnode.h>); VFS should dynamically assign us a type
        | VFS_TBLLOCALVOL               // our file system is local; causes MNT_LOCAL to be set and indicates 
                                        // that the first field of our file system specific mount arguments 
                                        // is a path to a block device
        | VFS_TBL64BITREADY,            // we are 64-bit aware; our mount, ioctl and sysctl entry points 
                                        // can be called by both 32-bit and 64-bit processes; we will use 
                                        // the type of process to interpret our arguments (if they're not 
                                        // 32/64-bit invariant)
    {NULL, NULL}                                            // vfe_reserv
};
 
// gVFSTableRef is the registration handle for our VFS plug-in.  MODULE_START 
// passes its address to vfs_fsadd, and MODULE_STOP passes the value to 
// vfs_fsremove and resets it to NULL once deregistration succeeds.
static vfstable_t gVFSTableRef = NULL;
 
/////////////////////////////////////////////////////////////////////
#pragma mark ***** KEXT Load/Unload
 
// Prototypes for our main entry points, MODULE_START (KEXT load) and 
// MODULE_STOP (KEXT unload), to satisfy the strict error check we have 
// enabled.  We also force the symbols to be exported.
 
extern kern_return_t MODULE_START(kmod_info_t * ki, void * d);
extern kern_return_t MODULE_STOP (kmod_info_t * ki, void * d);
 
extern kern_return_t MODULE_START(kmod_info_t * ki, void * d)
    // The kernel calls this to initialise the KEXT.  We set up our memory and 
    // lock infrastructure, then the hash node layer, and finally register the 
    // VFS plug-in with vfs_fsadd.
    //
    // ki and d are unused.
    //
    // Returns the kern_return_t equivalent of the first error encountered, 
    // or of 0 if every step succeeded.
{
    #pragma unused(ki)
    #pragma unused(d)
    kern_return_t       initStatus;
    errno_t             err;
    
    assert(gVFSTableRef == NULL);           // just in case we get loaded twice (which shouldn't ever happen)
    
    // Each step only runs if the one before it succeeded.
    
    initStatus = InitMemoryAndLocks();
    err = ErrnoFromKernReturn(initStatus);
    if (err == 0) {
        err = HNodeInit(gLockGroup, LCK_ATTR_NULL, gOSMallocTag, kHNodeMagic, sizeof(FSNode));
        if (err == 0) {
            err = vfs_fsadd(&gVFSEntry, &gVFSTableRef);
        }
    }
    
    // On any failure, undo the initialisation.  Both teardown routines are 
    // called regardless of which step failed.
    
    if (err != 0) {
        HNodeTerm();
        TermMemoryAndLocks();
    }
 
    return KernReturnFromErrno(err);
}
 
extern kern_return_t MODULE_STOP(kmod_info_t * ki, void * d)
    // The kernel calls this to terminate the KEXT.  The key step is the call 
    // to vfs_fsremove, which deregisters our VFS plug-in.  If that fails 
    // (which it will if any of our volumes are mounted), the KEXT can't be 
    // unloaded and all of our state is left in place.
    //
    // ki and d are unused.
    //
    // Returns the kern_return_t equivalent of the vfs_fsremove result.
{
    #pragma unused(ki)
    #pragma unused(d)
    errno_t             err;
 
    err = vfs_fsremove(gVFSTableRef);
    if (err != 0) {
        // Still registered; keep everything intact.
        return KernReturnFromErrno(err);
    }
 
    // Deregistered, so forget the handle and tear down the infrastructure 
    // that MODULE_START set up.
    
    gVFSTableRef = NULL;
    HNodeTerm();
    TermMemoryAndLocks();
 
    return KernReturnFromErrno(err);
}