Hi, this is another try at making read clustering work for msdosfs.
Last year, mpi@ implemented VFS read clustering for MSDOSFS in sys/msdosfs/denode.h 1.28 sys/msdosfs/msdosfs_vnops.c 1.105 This caused regressions when doing seeks past the end of the file and had. to be reverted. Then I tried to fix that in denode.h,v 1.31 msdosfs_vnops.c,v 1.114 But that caused different problems and had to be reverted, too. I have another test for that that I will commit soon. It seems that while netbsd uses DEV_BSIZE as base for the logical block numbers even in files, openbsd is more like freebsd and uses mnt_stat.f_iosize in that case. However, if the file system does accesses for metadata on the disk, it has to use DEV_BSIZE for accesses. So the fixes in 1.31/1.114 were wrong and the following diff is again based on 1.28/1.105 with some fixes in extendfile() to make seeks past the end of the file work. Testers/reviews/OKs welcome. Especially people who need msdosfs for booting via uefi. In any case, I won't commit until I get home again in a week. If my analysis with the block sizes is correct, I will add some clarifications to the bread/bwrite man page later. Cheers, Stefan diff --git a/sys/msdosfs/denode.h b/sys/msdosfs/denode.h index 38ae2027a63..4e521c6d5b8 100644 --- a/sys/msdosfs/denode.h +++ b/sys/msdosfs/denode.h @@ -142,7 +142,6 @@ struct denode { struct vnode *de_devvp; /* vnode of blk dev we live on */ uint32_t de_flag; /* flag bits */ dev_t de_dev; /* device where direntry lives */ - daddr_t de_lastr; uint32_t de_dirclust; /* cluster of the directory file containing this entry */ uint32_t de_diroffset; /* offset of this entry in the directory cluster */ uint32_t de_fndoffset; /* offset of found dir entry */ diff --git a/sys/msdosfs/msdosfs_fat.c b/sys/msdosfs/msdosfs_fat.c index 9e4675fa7e5..bee7c07f4d7 100644 --- a/sys/msdosfs/msdosfs_fat.c +++ b/sys/msdosfs/msdosfs_fat.c @@ -1021,14 +1021,12 @@ extendfile(struct denode *dep, uint32_t count, struct buf **bpp, uint32_t *ncp, bp = getblk(pmp->pm_devvp, cntobn(pmp, cn++), pmp->pm_bpcluster, 0, 0); else { - bp = getblk(DETOV(dep), de_cn2bn(pmp, frcn++), + bp = getblk(DETOV(dep), frcn++, pmp->pm_bpcluster, 0, 0); /* * Do the bmap now, as in msdosfs_write */ - if (pcbmap(dep, - de_bn2cn(pmp, bp->b_lblkno), - &bp->b_blkno, 0, 0)) + if (pcbmap(dep, bp->b_lblkno, &bp->b_blkno, 0, 0)) bp->b_blkno = -1; if (bp->b_blkno == -1) panic("extendfile: pcbmap"); diff --git a/sys/msdosfs/msdosfs_vnops.c b/sys/msdosfs/msdosfs_vnops.c index 8452e9be80c..cadaf78920d 100644 --- a/sys/msdosfs/msdosfs_vnops.c +++ b/sys/msdosfs/msdosfs_vnops.c @@ -77,13 +77,13 @@ static uint32_t fileidhash(uint64_t); +int msdosfs_bmaparray(struct vnode *, daddr_t, daddr_t *, int *); int msdosfs_kqfilter(void *); int filt_msdosfsread(struct knote *, long); int filt_msdosfswrite(struct knote *, long); int filt_msdosfsvnode(struct knote *, long); void filt_msdosfsdetach(struct knote *); - /* * Some general notes: * @@ -509,18 +509,14 @@ int msdosfs_read(void *v) { struct vop_read_args *ap = v; - int error = 0; - uint32_t diff; - int blsize; - int isadir; - uint32_t n; - long on; - daddr_t lbn, rablock, rablkno; - struct buf *bp; struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); struct msdosfsmount *pmp = dep->de_pmp; struct uio *uio = ap->a_uio; + int isadir, error = 0; + uint32_t n, diff, size, on; + struct buf *bp; + daddr_t cn, bn; /* * If they didn't ask for any data, then we are done. @@ -535,7 +531,8 @@ msdosfs_read(void *v) if (uio->uio_offset >= dep->de_FileSize) return (0); - lbn = de_cluster(pmp, uio->uio_offset); + cn = de_cluster(pmp, uio->uio_offset); + size = pmp->pm_bpcluster; on = uio->uio_offset & pmp->pm_crbomask; n = ulmin(pmp->pm_bpcluster - on, uio->uio_resid); @@ -548,31 +545,23 @@ msdosfs_read(void *v) if (diff < n) n = diff; - /* convert cluster # to block # if a directory */ - if (isadir) { - error = pcbmap(dep, lbn, &lbn, 0, &blsize); - if (error) - return (error); - } /* * If we are operating on a directory file then be sure to * do i/o with the vnode for the filesystem instead of the - * vnode for the directory. + * vnode for the directory. In that case, the block number + * is physical blocks and not clusters. */ if (isadir) { - error = bread(pmp->pm_devvp, lbn, blsize, &bp); + /* dirfile-relative cn -> fs-relative bn */ + error = pcbmap(dep, cn, &bn, 0, &size); + if (error) + return (error); + error = bread(pmp->pm_devvp, bn, size, &bp); } else { - rablock = lbn + 1; - rablkno = de_cn2bn(pmp, rablock); - if (dep->de_lastr + 1 == lbn && - de_cn2off(pmp, rablock) < dep->de_FileSize) - error = breadn(vp, de_cn2bn(pmp, lbn), - pmp->pm_bpcluster, &rablkno, - &pmp->pm_bpcluster, 1, &bp); + if (de_cn2off(pmp, cn + 1) >= dep->de_FileSize) + error = bread(vp, cn, size, &bp); else - error = bread(vp, de_cn2bn(pmp, lbn), - pmp->pm_bpcluster, &bp); - dep->de_lastr = lbn; + error = bread_cluster(vp, cn, size, &bp); } n = min(n, pmp->pm_bpcluster - bp->b_resid); if (error) { @@ -601,7 +590,7 @@ msdosfs_write(void *v) uint32_t osize; int error = 0; uint32_t count, lastcn; - daddr_t bn; + daddr_t cn; struct buf *bp; int ioflag = ap->a_ioflag; struct uio *uio = ap->a_uio; @@ -678,30 +667,30 @@ msdosfs_write(void *v) lastcn = de_clcount(pmp, osize) - 1; do { - if (de_cluster(pmp, uio->uio_offset) > lastcn) { + croffset = uio->uio_offset & pmp->pm_crbomask; + cn = de_cluster(pmp, uio->uio_offset); + + if (cn > lastcn) { error = ENOSPC; break; } - bn = de_blk(pmp, uio->uio_offset); - if ((uio->uio_offset & pmp->pm_crbomask) == 0 - && (de_blk(pmp, uio->uio_offset + uio->uio_resid) > de_blk(pmp, uio->uio_offset) - || uio->uio_offset + uio->uio_resid >= dep->de_FileSize)) { + if (croffset == 0 && + (de_cluster(pmp, uio->uio_offset + uio->uio_resid) > cn || + (uio->uio_offset + uio->uio_resid) >= dep->de_FileSize)) { /* * If either the whole cluster gets written, * or we write the cluster from its start beyond EOF, * then no need to read data from disk. */ - bp = getblk(thisvp, bn, pmp->pm_bpcluster, 0, 0); + bp = getblk(thisvp, cn, pmp->pm_bpcluster, 0, 0); clrbuf(bp); /* * Do the bmap now, since pcbmap needs buffers * for the fat table. (see msdosfs_strategy) */ if (bp->b_blkno == bp->b_lblkno) { - error = pcbmap(dep, - de_bn2cn(pmp, bp->b_lblkno), - &bp->b_blkno, 0, 0); + error = pcbmap(dep, bp->b_lblkno, &bp->b_blkno, 0, 0); if (error) bp->b_blkno = -1; } @@ -713,16 +702,16 @@ msdosfs_write(void *v) } } else { /* - * The block we need to write into exists, so read it in. + * The block we need to write into exists, so + * read it in. */ - error = bread(thisvp, bn, pmp->pm_bpcluster, &bp); + error = bread(thisvp, cn, pmp->pm_bpcluster, &bp); if (error) { brelse(bp); break; } } - croffset = uio->uio_offset & pmp->pm_crbomask; n = ulmin(uio->uio_resid, pmp->pm_bpcluster - croffset); if (uio->uio_offset + n > dep->de_FileSize) { dep->de_FileSize = uio->uio_offset + n; @@ -1755,7 +1744,7 @@ msdosfs_islocked(void *v) /* * vp - address of vnode file the file - * bn - which cluster we are interested in mapping to a filesystem block number. + * bn - which cluster we are interested in mapping to a filesystem block number * vpp - returns the vnode for the block special file holding the filesystem * containing the file of interest * bnp - address of where to return the filesystem relative block number @@ -1765,19 +1754,51 @@ msdosfs_bmap(void *v) { struct vop_bmap_args *ap = v; struct denode *dep = VTODE(ap->a_vp); - struct msdosfsmount *pmp = dep->de_pmp; if (ap->a_vpp != NULL) *ap->a_vpp = dep->de_devvp; if (ap->a_bnp == NULL) return (0); - if (ap->a_runp) { + + return (msdosfs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, ap->a_runp)); +} + +int +msdosfs_bmaparray(struct vnode *vp, daddr_t cn, daddr_t *bnp, int *runp) +{ + struct denode *dep = VTODE(vp); + struct msdosfsmount *pmp = dep->de_pmp; + struct mount *mp; + int error, maxrun = 0, run; + daddr_t daddr; + + mp = vp->v_mount; + + if (runp) { /* - * Sequential clusters should be counted here. + * XXX + * If MAXBSIZE is the largest transfer the disks can handle, + * we probably want maxrun to be 1 block less so that we + * don't create a block larger than the device can handle. */ - *ap->a_runp = 0; + *runp = 0; + maxrun = min(MAXBSIZE / mp->mnt_stat.f_iosize - 1, + pmp->pm_maxcluster - cn); } - return (pcbmap(dep, de_bn2cn(pmp, ap->a_bn), ap->a_bnp, 0, 0)); + + if ((error = pcbmap(dep, cn, bnp, 0, 0)) != 0) + return (error); + + for (run = 1; run <= maxrun; run++) { + error = pcbmap(dep, cn + run, &daddr, 0, 0); + if (error != 0 || (daddr != *bnp + de_cn2bn(pmp, run))) + break; + } + + if (runp) + *runp = run - 1; + + return (0); } int @@ -1799,8 +1820,7 @@ msdosfs_strategy(void *v) * don't allow files with holes, so we shouldn't ever see this. */ if (bp->b_blkno == bp->b_lblkno) { - error = pcbmap(dep, de_bn2cn(dep->de_pmp, bp->b_lblkno), - &bp->b_blkno, 0, 0); + error = pcbmap(dep, bp->b_lblkno, &bp->b_blkno, 0, 0); if (error) bp->b_blkno = -1; if (bp->b_blkno == -1)