Module Name:    src
Committed By:   perseant
Date:           Mon Jan  2 22:10:45 UTC 2012

Modified Files:
        src/sys/ufs/lfs: lfs.h lfs_bio.c lfs_extern.h lfs_segment.c lfs_subr.c
            lfs_syscalls.c lfs_vfsops.c lfs_vnops.c
        src/sys/ufs/ufs: inode.h ufs_readwrite.c

Log Message:
* Remove PGO_RECLAIM during lfs_putpages()' call to genfs_putpages(),
  to avoid a live lock in the latter when reclaiming a vnode with
  dirty pages.

* Add a new segment flag, SEGM_RECLAIM, to note when a segment is
  being written for vnode reclamation, and record which inode is being
  reclaimed, to aid in forensic debugging.

* Add a new segment flag, SEGM_SINGLE, so that opportunistic writes
  can write a single segment's worth of blocks and then stop, rather
  than writing all the way up to the cleaner's reserved number of
  segments.

* Add assert statements to check mutex ownership is the way it ought
  to be, mostly in lfs_putpages; fix problems uncovered by this.

* Don't clear VU_DIROP until the inode actually makes its way to disk,
  avoiding a problem where dirop inodes could become separated
  (uncovered by a modified version of the "ckckp" forensic regression
  test).

* Move the vfs_getopsbyname() call into lfs_writerd.  Prepare code to
  make lfs_writerd notice when there are no more LFSs, and exit losing
  the reference, so that, in theory, the module can be unloaded.  This
  code is not enabled, since it causes a crash on exit.

* Set IN_MODIFIED on inodes flushed by lfs_flush_dirops.  Really we
  only need to set IN_MODIFIED if we are going to write them again
  (e.g., to write pages); need to think about this more.

Finally, several changes to help avoid "no clean segments" panics:

* In lfs_bmapv, note when a vnode is loaded only to discover whether
  its blocks are live, so it can immediately be recycled.  Since the
  cleaner will try to choose ~empty segments over full ones, this
  prevents the cleaner from (1) filling the vnode cache with junk, and
  (2) squeezing any unwritten writes to disk and running the fs out of
  segments.

* Overestimate by half the amount of metadata that will be required
  to fill the clean segments.  This will make the disk appear smaller,
  but should help avoid a "no clean segments" panic.

* Rearrange lfs_writerd.  In particular, lfs_writerd now pays
  attention to the number of clean segments available, and holds off
  writing until there is room.


To generate a diff of this commit:
cvs rdiff -u -r1.134 -r1.135 src/sys/ufs/lfs/lfs.h
cvs rdiff -u -r1.120 -r1.121 src/sys/ufs/lfs/lfs_bio.c
cvs rdiff -u -r1.96 -r1.97 src/sys/ufs/lfs/lfs_extern.h
cvs rdiff -u -r1.222 -r1.223 src/sys/ufs/lfs/lfs_segment.c
cvs rdiff -u -r1.76 -r1.77 src/sys/ufs/lfs/lfs_subr.c
cvs rdiff -u -r1.139 -r1.140 src/sys/ufs/lfs/lfs_syscalls.c
cvs rdiff -u -r1.291 -r1.292 src/sys/ufs/lfs/lfs_vfsops.c
cvs rdiff -u -r1.238 -r1.239 src/sys/ufs/lfs/lfs_vnops.c
cvs rdiff -u -r1.58 -r1.59 src/sys/ufs/ufs/inode.h
cvs rdiff -u -r1.100 -r1.101 src/sys/ufs/ufs/ufs_readwrite.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/ufs/lfs/lfs.h
diff -u src/sys/ufs/lfs/lfs.h:1.134 src/sys/ufs/lfs/lfs.h:1.135
--- src/sys/ufs/lfs/lfs.h:1.134	Mon Jul 11 08:27:40 2011
+++ src/sys/ufs/lfs/lfs.h	Mon Jan  2 22:10:44 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs.h,v 1.134 2011/07/11 08:27:40 hannken Exp $	*/
+/*	$NetBSD: lfs.h,v 1.135 2012/01/02 22:10:44 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@@ -592,6 +592,7 @@ struct segsum_v1 {
 #define	SS_CONT		0x02		/* more partials to finish this write*/
 #define	SS_CLEAN	0x04		/* written by the cleaner */
 #define	SS_RFW		0x08		/* written by the roll-forward agent */
+#define	SS_RECLAIM	0x10		/* written to reclaim a vnode */
 	u_int16_t ss_flags;		/* 24: used for directory operations */
 	u_int16_t ss_pad;		/* 26: extra space */
 	/* FINFO's and inode daddr's... */
@@ -608,7 +609,8 @@ struct segsum {
 	u_int16_t ss_nfinfo;		/* 20: number of file info structures */
 	u_int16_t ss_ninos;		/* 22: number of inodes in summary */
 	u_int16_t ss_flags;		/* 24: used for directory operations */
-	u_int8_t  ss_pad[6];		/* 26: extra space */
+	u_int8_t  ss_pad[2];		/* 26: extra space */
+	u_int32_t ss_reclino;           /* 28: inode being reclaimed */
 	u_int64_t ss_serial;		/* 32: serial number */
 	u_int64_t ss_create;		/* 40: time stamp */
 	/* FINFO's and inode daddr's... */
@@ -840,6 +842,8 @@ struct lfs {
 	int lfs_nowrap;			/* Suspend log wrap */
 	int lfs_wrappass;		/* Allow first log wrap requester to pass */
 	int lfs_wrapstatus;		/* Wrap status */
+	int lfs_reclino;		/* Inode being reclaimed */
+	int lfs_startseg;               /* Segment we started writing at */
 	LIST_HEAD(, segdelta) lfs_segdhd;	/* List of pending trunc accounting events */
 };
 
@@ -945,13 +949,15 @@ struct segment {
 	u_int32_t seg_number;		/* number of this segment */
 	int32_t *start_lbp;		/* beginning lbn for this set */
 
-#define	SEGM_CKP	0x01		/* doing a checkpoint */
-#define	SEGM_CLEAN	0x02		/* cleaner call; don't sort */
-#define	SEGM_SYNC	0x04		/* wait for segment */
-#define	SEGM_PROT	0x08		/* don't inactivate at segunlock */
-#define SEGM_PAGEDAEMON	0x10		/* pagedaemon called us */
-#define SEGM_WRITERD	0x20		/* LFS writed called us */
-#define SEGM_FORCE_CKP	0x40		/* Force checkpoint right away */
+#define SEGM_CKP	0x0001		/* doing a checkpoint */
+#define SEGM_CLEAN	0x0002		/* cleaner call; don't sort */
+#define SEGM_SYNC	0x0004		/* wait for segment */
+#define SEGM_PROT	0x0008		/* don't inactivate at segunlock */
+#define SEGM_PAGEDAEMON	0x0010		/* pagedaemon called us */
+#define SEGM_WRITERD	0x0020		/* lfs_writerd called us */
+#define SEGM_FORCE_CKP	0x0040		/* Force checkpoint right away */
+#define SEGM_RECLAIM	0x0080		/* Writing to reclaim vnode */
+#define SEGM_SINGLE	0x0100		/* Opportunistic writevnodes */
 	u_int16_t seg_flags;		/* run-time flags for this segment */
 	u_int32_t seg_iocount;		/* number of ios pending */
 	int	  ndupino;		/* number of duplicate inodes */
@@ -992,6 +998,7 @@ struct lfs_inode_ext {
 #define LFSI_DELETED      0x02
 #define LFSI_WRAPBLOCK    0x04
 #define LFSI_WRAPWAIT     0x08
+#define LFSI_BMAP         0x10
 	u_int32_t lfs_iflags;           /* Inode flags */
 	daddr_t   lfs_hiblk;		/* Highest lbn held by inode */
 #ifdef _KERNEL
@@ -1017,10 +1024,16 @@ struct lfs_inode_ext {
  * Macros for determining free space on the disk, with the variable metadata
  * of segment summaries and inode blocks taken into account.
  */
-/* Estimate number of clean blocks not available for writing */
-#define LFS_EST_CMETA(F) (int32_t)((((F)->lfs_dmeta *			     \
-				     (int64_t)(F)->lfs_nclean) /	     \
-				      ((F)->lfs_nseg - (F)->lfs_nclean)))
+/*
+ * Estimate number of clean blocks not available for writing because
+ * they will contain metadata or overhead.  This is calculated as
+ * (dmeta / # dirty segments) * (# clean segments).
+ */
+#define CM_MAG_NUM 3
+#define CM_MAG_DEN 2
+#define LFS_EST_CMETA(F) (int32_t)((					\
+				    (CM_MAG_NUM * ((F)->lfs_dmeta * (int64_t)(F)->lfs_nclean)) / \
+				    (CM_MAG_DEN * ((F)->lfs_nseg - (F)->lfs_nclean))))
 
 /* Estimate total size of the disk not including metadata */
 #define LFS_EST_NONMETA(F) ((F)->lfs_dsize - (F)->lfs_dmeta - LFS_EST_CMETA(F))

Index: src/sys/ufs/lfs/lfs_bio.c
diff -u src/sys/ufs/lfs/lfs_bio.c:1.120 src/sys/ufs/lfs/lfs_bio.c:1.121
--- src/sys/ufs/lfs/lfs_bio.c:1.120	Mon Jul 11 08:27:40 2011
+++ src/sys/ufs/lfs/lfs_bio.c	Mon Jan  2 22:10:44 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $	*/
+/*	$NetBSD: lfs_bio.c,v 1.121 2012/01/02 22:10:44 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc.
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.120 2011/07/11 08:27:40 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.121 2012/01/02 22:10:44 perseant Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -96,6 +96,7 @@ int	lfs_subsys_pages     = 0L;	/* Total 
 int	lfs_fs_pagetrip	     = 0;	/* # of pages to trip per-fs write */
 int	lfs_writing	     = 0;	/* Set if already kicked off a writer
 					   because of buffer space */
+int	locked_queue_waiters = 0;	/* Number of processes waiting on lq */
 
 /* Lock and condition variables for above. */
 kcondvar_t	locked_queue_cv;
@@ -160,8 +161,12 @@ lfs_reservebuf(struct lfs *fs, struct vn
 
 		lfs_flush(fs, 0, 0);
 
+		DLOG((DLOG_AVAIL, "lfs_reservebuf: waiting: count=%d, bytes=%ld\n",
+		      locked_queue_count, locked_queue_bytes));
+		++locked_queue_waiters;
 		error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
 		    hz * LFS_BUFWAIT);
+		--locked_queue_waiters;
 		if (error && error != EWOULDBLOCK) {
 			mutex_exit(&lfs_lock);
 			return error;
@@ -171,8 +176,11 @@ lfs_reservebuf(struct lfs *fs, struct vn
 	locked_queue_rcount += n;
 	locked_queue_rbytes += bytes;
 
-	if (n < 0)
+	if (n < 0 && locked_queue_waiters > 0) {
+		DLOG((DLOG_AVAIL, "lfs_reservebuf: broadcast: count=%d, bytes=%ld\n",
+		      locked_queue_count, locked_queue_bytes));
 		cv_broadcast(&locked_queue_cv);
+	}
 
 	mutex_exit(&lfs_lock);
 
@@ -461,7 +469,7 @@ lfs_bwrite_ext(struct buf *bp, int flags
 	 */
 	if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) {
 		bp->b_oflags &= ~BO_DELWRI;
-		bp->b_flags |= B_READ;
+		bp->b_flags |= B_READ; /* XXX is this right? --ks */
 		bp->b_error = 0;
 		mutex_enter(&bufcache_lock);
 		LFS_UNLOCK_BUF(bp);
@@ -535,6 +543,7 @@ lfs_flush_fs(struct lfs *fs, int flags)
 	if (lfs_dostats)
 		++lfs_stats.flush_invoked;
 
+	fs->lfs_pdflush = 0;
 	mutex_exit(&lfs_lock);
 	lfs_writer_enter(fs, "fldirop");
 	lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
@@ -689,10 +698,10 @@ lfs_check(struct vnode *vp, daddr_t blkn
 	/* If there are too many pending dirops, we have to flush them. */
 	if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
 	    lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
-		flags |= SEGM_CKP;
-	}
-
-	if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
+		mutex_exit(&lfs_lock);
+		lfs_flush_dirops(fs);
+		mutex_enter(&lfs_lock);
+	} else if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
 	    locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
 	    lfs_subsys_pages > LFS_MAX_PAGES ||
 	    fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
@@ -717,8 +726,10 @@ lfs_check(struct vnode *vp, daddr_t blkn
 			++lfs_stats.wait_exceeded;
 		DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n",
 		      locked_queue_count, locked_queue_bytes));
+		++locked_queue_waiters;
 		error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
 		    hz * LFS_BUFWAIT);
+		--locked_queue_waiters;
 		if (error != EWOULDBLOCK)
 			break;
 

Index: src/sys/ufs/lfs/lfs_extern.h
diff -u src/sys/ufs/lfs/lfs_extern.h:1.96 src/sys/ufs/lfs/lfs_extern.h:1.97
--- src/sys/ufs/lfs/lfs_extern.h:1.96	Sat Jun 28 01:34:05 2008
+++ src/sys/ufs/lfs/lfs_extern.h	Mon Jan  2 22:10:44 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_extern.h,v 1.96 2008/06/28 01:34:05 rumble Exp $	*/
+/*	$NetBSD: lfs_extern.h,v 1.97 2012/01/02 22:10:44 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@@ -240,8 +240,8 @@ int lfs_gop_alloc(struct vnode *, off_t,
 void lfs_gop_size(struct vnode *, off_t, off_t *, int);
 int lfs_putpages_ext(void *, int);
 int lfs_gatherpages(struct vnode *);
-void lfs_flush_dirops(struct lfs *);
-void lfs_flush_pchain(struct lfs *);
+int lfs_flush_dirops(struct lfs *);
+int lfs_flush_pchain(struct lfs *);
 
 int lfs_bwrite	 (void *);
 int lfs_fsync	 (void *);

Index: src/sys/ufs/lfs/lfs_segment.c
diff -u src/sys/ufs/lfs/lfs_segment.c:1.222 src/sys/ufs/lfs/lfs_segment.c:1.223
--- src/sys/ufs/lfs/lfs_segment.c:1.222	Mon Jul 11 08:27:40 2011
+++ src/sys/ufs/lfs/lfs_segment.c	Mon Jan  2 22:10:44 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $	*/
+/*	$NetBSD: lfs_segment.c,v 1.223 2012/01/02 22:10:44 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.222 2011/07/11 08:27:40 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_segment.c,v 1.223 2012/01/02 22:10:44 perseant Exp $");
 
 #ifdef DEBUG
 # define vndebug(vp, str) do {						\
@@ -202,6 +202,9 @@ lfs_vflush(struct vnode *vp)
 	relock = 0;
 
     top:
+	KASSERT(mutex_owned(vp->v_interlock) == false);
+	KASSERT(mutex_owned(&lfs_lock) == false);
+	KASSERT(mutex_owned(&bufcache_lock) == false);
 	ASSERT_NO_SEGLOCK(fs);
 	if (ip->i_flag & IN_CLEANING) {
 		ivndebug(vp,"vflush/in_cleaning");
@@ -280,7 +283,10 @@ lfs_vflush(struct vnode *vp)
 	mutex_exit(vp->v_interlock);
 
 	/* Protect against VI_XLOCK deadlock in vinvalbuf() */
-	lfs_seglock(fs, SEGM_SYNC);
+	lfs_seglock(fs, SEGM_SYNC | ((vp->v_iflag & VI_XLOCK) ? SEGM_RECLAIM : 0));
+	if (vp->v_iflag & VI_XLOCK) {
+		fs->lfs_reclino = ip->i_number;
+	}
 
 	/* If we're supposed to flush a freed inode, just toss it */
 	if (ip->i_lfs_iflags & LFSI_DELETED) {
@@ -380,11 +386,12 @@ lfs_vflush(struct vnode *vp)
 		do {
 			if (LIST_FIRST(&vp->v_dirtyblkhd) != NULL) {
 				relock = lfs_writefile(fs, sp, vp);
-				if (relock) {
+				if (relock && vp != fs->lfs_ivnode) {
 					/*
 					 * Might have to wait for the
 					 * cleaner to run; but we're
 					 * still not done with this vnode.
+					 * XXX we can do better than this.
 					 */
 					KDASSERT(ip->i_number != LFS_IFILE_INUM);
 					lfs_writeinode(fs, sp, ip);
@@ -486,9 +493,16 @@ lfs_writevnodes(struct lfs *fs, struct m
 			 * After this, pages might be busy
 			 * due to our own previous putpages.
 			 * Start actual segment write here to avoid deadlock.
+			 * If we were just writing one segment and we've done
+			 * that, break out.
 			 */
 			mutex_exit(&mntvnode_lock);
-			(void)lfs_writeseg(fs, sp);
+			if (lfs_writeseg(fs, sp) &&
+			    (sp->seg_flags & SEGM_SINGLE) &&
+			    fs->lfs_curseg != fs->lfs_startseg) {
+				DLOG((DLOG_VNODE, "lfs_writevnodes: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
+				break;
+			}
 			goto loop;
 		}
 
@@ -626,6 +640,10 @@ lfs_segwrite(struct mount *mp, int flags
 	 */
 	do_ckp = LFS_SHOULD_CHECKPOINT(fs, flags);
 
+	/* We can't do a partial write and checkpoint at the same time. */
+	if (do_ckp)
+		flags &= ~SEGM_SINGLE;
+
 	lfs_seglock(fs, flags | (do_ckp ? SEGM_CKP : 0));
 	sp = fs->lfs_sp;
 	if (sp->seg_flags & (SEGM_CLEAN | SEGM_CKP))
@@ -645,6 +663,11 @@ lfs_segwrite(struct mount *mp, int flags
 	else if (!(sp->seg_flags & SEGM_FORCE_CKP)) {
 		do {
 			um_error = lfs_writevnodes(fs, mp, sp, VN_REG);
+			if ((sp->seg_flags & SEGM_SINGLE) &&
+			    fs->lfs_curseg != fs->lfs_startseg) {
+				DLOG((DLOG_SEG, "lfs_segwrite: breaking out of segment write at daddr 0x%x\n", fs->lfs_offset));
+				break;
+			}
 
 			if (do_ckp || fs->lfs_dirops == 0) {
 				if (!writer_set) {
@@ -1025,6 +1048,7 @@ lfs_writeinode(struct lfs *fs, struct se
 {
 	struct buf *bp;
 	struct ufs1_dinode *cdp;
+	struct vnode *vp = ITOV(ip);
 	daddr_t daddr;
 	int32_t *daddrp;	/* XXX ondisk32 */
 	int i, ndx;
@@ -1033,7 +1057,7 @@ lfs_writeinode(struct lfs *fs, struct se
 	int count;
 
 	ASSERT_SEGLOCK(fs);
-	if (!(ip->i_flag & IN_ALLMOD))
+	if (!(ip->i_flag & IN_ALLMOD) && !(vp->v_uflag & VU_DIROP))
 		return (0);
 
 	/* Can't write ifile when writer is not set */
@@ -1047,7 +1071,7 @@ lfs_writeinode(struct lfs *fs, struct se
 	 * solid.
 	 */
 	count = 0;
-	while (ip->i_number == LFS_IFILE_INUM) {
+	while (vp == fs->lfs_ivnode) {
 		int redo = 0;
 
 		if (sp->idp == NULL && sp->ibp == NULL &&
@@ -1112,7 +1136,7 @@ lfs_writeinode(struct lfs *fs, struct se
 	}
 
 	/* Check VU_DIROP in case there is a new file with no data blocks */
-	if (ITOV(ip)->v_uflag & VU_DIROP)
+	if (vp->v_uflag & VU_DIROP)
 		((SEGSUM *)(sp->segsum))->ss_flags |= (SS_DIROP|SS_CONT);
 
 	/* Update the inode times and copy the inode onto the inode page. */
@@ -1139,6 +1163,18 @@ lfs_writeinode(struct lfs *fs, struct se
 	*cdp = *ip->i_din.ffs1_din;
 
 	/*
+	 * This inode is on its way to disk; clear its VU_DIROP status when
+	 * the write is complete.
+	 */
+	if (vp->v_uflag & VU_DIROP) {
+		if (!(sp->seg_flags & SEGM_CLEAN))
+			ip->i_flag |= IN_CDIROP;
+		else {
+			DLOG((DLOG_DIROP, "lfs_writeinode: not clearing dirop for cleaned ino %d\n", (int)ip->i_number));
+		}
+	}
+
+	/*
 	 * If cleaning, link counts and directory file sizes cannot change,
 	 * since those would be directory operations---even if the file
 	 * we are writing is marked VU_DIROP we should write the old values.
@@ -1146,9 +1182,9 @@ lfs_writeinode(struct lfs *fs, struct se
 	 * current values the next time we clean.
 	 */
 	if (sp->seg_flags & SEGM_CLEAN) {
-		if (ITOV(ip)->v_uflag & VU_DIROP) {
+		if (vp->v_uflag & VU_DIROP) {
 			cdp->di_nlink = ip->i_lfs_odnlink;
-			/* if (ITOV(ip)->v_type == VDIR) */
+			/* if (vp->v_type == VDIR) */
 			cdp->di_size = ip->i_lfs_osize;
 		}
 	} else {
@@ -1988,6 +2024,12 @@ lfs_writeseg(struct lfs *fs, struct segm
 	if (sp->seg_flags & SEGM_CLEAN)
 		ssp->ss_flags |= SS_CLEAN;
 
+	/* Note if we are writing to reclaim */
+	if (sp->seg_flags & SEGM_RECLAIM) {
+		ssp->ss_flags |= SS_RECLAIM;
+		ssp->ss_reclino = fs->lfs_reclino;
+	}
+
 	devvp = VTOI(fs->lfs_ivnode)->i_devvp;
 
 	/* Update the segment usage information. */
@@ -2720,7 +2762,6 @@ lfs_shellsort(struct buf **bp_array, int
 int
 lfs_vref(struct vnode *vp)
 {
-	int error;
 	struct lfs *fs;
 
 	KASSERT(mutex_owned(vp->v_interlock));
@@ -2734,12 +2775,13 @@ lfs_vref(struct vnode *vp)
 	 * being able to flush all of the pages from this vnode, which
 	 * will cause it to panic.  So, return 0 if a flush is in progress.
 	 */
-	error = vget(vp, LK_NOWAIT);
-	if (error == EBUSY && IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
-		++fs->lfs_flushvp_fakevref;
-		return 0;
-	}
-	return error;
+	if (IS_FLUSHING(VTOI(vp)->i_lfs, vp)) {
+ 		++fs->lfs_flushvp_fakevref;
+		mutex_exit(vp->v_interlock);
+ 		return 0;
+ 	}
+
+	return vget(vp, LK_NOWAIT);
 }
 
 /*

Index: src/sys/ufs/lfs/lfs_subr.c
diff -u src/sys/ufs/lfs/lfs_subr.c:1.76 src/sys/ufs/lfs/lfs_subr.c:1.77
--- src/sys/ufs/lfs/lfs_subr.c:1.76	Fri Jun 25 10:03:52 2010
+++ src/sys/ufs/lfs/lfs_subr.c	Mon Jan  2 22:10:44 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $	*/
+/*	$NetBSD: lfs_subr.c,v 1.77 2012/01/02 22:10:44 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.76 2010/06/25 10:03:52 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_subr.c,v 1.77 2012/01/02 22:10:44 perseant Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -335,6 +335,7 @@ lfs_seglock(struct lfs *fs, unsigned lon
 	 */
 	mutex_enter(&lfs_lock);
 	++fs->lfs_iocount;
+	fs->lfs_startseg = fs->lfs_curseg;
 	mutex_exit(&lfs_lock);
 	return 0;
 }
@@ -361,7 +362,7 @@ lfs_unmark_dirop(struct lfs *fs)
 	for (ip = TAILQ_FIRST(&fs->lfs_dchainhd); ip != NULL; ip = nip) {
 		nip = TAILQ_NEXT(ip, i_lfs_dchain);
 		vp = ITOV(ip);
-		if ((VTOI(vp)->i_flag & (IN_ADIROP | IN_ALLMOD)) == 0) {
+		if ((ip->i_flag & (IN_ADIROP | IN_CDIROP)) == IN_CDIROP) {
 			--lfs_dirvcount;
 			--fs->lfs_dirvcount;
 			vp->v_uflag &= ~VU_DIROP;
@@ -372,6 +373,7 @@ lfs_unmark_dirop(struct lfs *fs)
 			vrele(vp);
 			mutex_enter(&lfs_lock);
 			fs->lfs_unlockvp = NULL;
+			ip->i_flag &= ~IN_CDIROP;
 		}
 	}
 
@@ -437,8 +439,7 @@ lfs_segunlock(struct lfs *fs)
 	mutex_enter(&lfs_lock);
 	KASSERT(LFS_SEGLOCK_HELD(fs));
 	if (fs->lfs_seglock == 1) {
-		if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0 &&
-		    LFS_STARVED_FOR_SEGS(fs) == 0)
+		if ((sp->seg_flags & (SEGM_PROT | SEGM_CLEAN)) == 0)
 			do_unmark_dirop = 1;
 		mutex_exit(&lfs_lock);
 		sync = sp->seg_flags & SEGM_SYNC;

Index: src/sys/ufs/lfs/lfs_syscalls.c
diff -u src/sys/ufs/lfs/lfs_syscalls.c:1.139 src/sys/ufs/lfs/lfs_syscalls.c:1.140
--- src/sys/ufs/lfs/lfs_syscalls.c:1.139	Sun Jun 12 03:36:01 2011
+++ src/sys/ufs/lfs/lfs_syscalls.c	Mon Jan  2 22:10:45 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $	*/
+/*	$NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007, 2008
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.139 2011/06/12 03:36:01 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_syscalls.c,v 1.140 2012/01/02 22:10:45 perseant Exp $");
 
 #ifndef LFS
 # define LFS		/* for prototypes in syscallargs.h */
@@ -291,6 +291,17 @@ lfs_markv(struct proc *p, fsid_t *fsidp,
 			 */
 			if (v_daddr != LFS_UNUSED_DADDR) {
 				lfs_vunref(vp);
+				/*
+				 * If the vnode has LFSI_BMAP, it was
+				 * not found in the cache.  Dump it so
+				 * we can reuse the vnode.
+				 * XXX If we knew what segment we were
+				 * XXX supposed to be looking for, we
+				 * XXX would be able to be more selective
+				 * XXX here.
+				 */
+				if (ip->i_lfs_iflags & LFSI_BMAP)
+					vrecycle(vp, NULL, NULL);
 				numrefed--;
 			}
 
@@ -760,6 +771,7 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 					continue;
 				} else {
 					KASSERT(VOP_ISLOCKED(vp));
+					VTOI(vp)->i_lfs_iflags |= LFSI_BMAP;
 					VOP_UNLOCK(vp);
 					numrefed++;
 				}
@@ -814,6 +826,9 @@ lfs_bmapv(struct proc *p, fsid_t *fsidp,
 	 */
 	if (v_daddr != LFS_UNUSED_DADDR) {
 		lfs_vunref(vp);
+		/* Recycle as above. */
+		if (ip->i_lfs_iflags & LFSI_BMAP)
+			vrecycle(vp, NULL, NULL);
 		numrefed--;
 	}
 

Index: src/sys/ufs/lfs/lfs_vfsops.c
diff -u src/sys/ufs/lfs/lfs_vfsops.c:1.291 src/sys/ufs/lfs/lfs_vfsops.c:1.292
--- src/sys/ufs/lfs/lfs_vfsops.c:1.291	Mon Nov 14 18:35:14 2011
+++ src/sys/ufs/lfs/lfs_vfsops.c	Mon Jan  2 22:10:45 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $	*/
+/*	$NetBSD: lfs_vfsops.c,v 1.292 2012/01/02 22:10:45 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.291 2011/11/14 18:35:14 hannken Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.292 2012/01/02 22:10:45 perseant Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_lfs.h"
@@ -129,6 +129,7 @@ extern const struct vnodeopv_desc lfs_sp
 extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
 
 pid_t lfs_writer_daemon = 0;
+lwpid_t lfs_writer_lid = 0;
 int lfs_do_flush = 0;
 #ifdef LFS_KERNEL_RFW
 int lfs_do_rfw = 0;
@@ -399,85 +400,151 @@ struct pool lfs_lbnentry_pool;
 static void
 lfs_writerd(void *arg)
 {
-	struct mount *mp, *nmp;
-	struct lfs *fs;
-	int fsflags;
-	int loopcount;
-
-	lfs_writer_daemon = curproc->p_pid;
-
+ 	struct mount *mp, *nmp;
+ 	struct lfs *fs;
+	struct vfsops *vfs = NULL;
+ 	int fsflags;
+ 	int loopcount;
+	int skipc;
+	int lfsc;
+	int wrote_something = 0;
+ 
 	mutex_enter(&lfs_lock);
-	for (;;) {
-		mtsleep(&lfs_writer_daemon, PVM | PNORELOCK, "lfswriter", hz/10,
-		    &lfs_lock);
+ 	lfs_writer_daemon = curproc->p_pid;
+	lfs_writer_lid = curlwp->l_lid;
+	mutex_exit(&lfs_lock);
 
-		/*
-		 * Look through the list of LFSs to see if any of them
-		 * have requested pageouts.
-		 */
-		mutex_enter(&mountlist_lock);
-		for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
-		     mp = nmp) {
-			if (vfs_busy(mp, &nmp)) {
-				continue;
-			}
-			if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
-			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
-				fs = VFSTOUFS(mp)->um_lfs;
-				mutex_enter(&lfs_lock);
-				fsflags = 0;
-				if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
-				     lfs_dirvcount > LFS_MAX_DIROP) &&
-				    fs->lfs_dirops == 0)
-					fsflags |= SEGM_CKP;
-				if (fs->lfs_pdflush) {
-					DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
-					fs->lfs_pdflush = 0;
-					lfs_flush_fs(fs, fsflags);
-					mutex_exit(&lfs_lock);
-				} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
-					DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
-					mutex_exit(&lfs_lock);
-					lfs_writer_enter(fs, "wrdirop");
-					lfs_flush_pchain(fs);
-					lfs_writer_leave(fs);
-				} else
-					mutex_exit(&lfs_lock);
-			}
-			vfs_unbusy(mp, false, &nmp);
-		}
-		mutex_exit(&mountlist_lock);
+	/* Take an extra reference to the LFS vfsops. */
+	vfs = vfs_getopsbyname(MOUNT_LFS);
+ 
+ 	mutex_enter(&lfs_lock);
+ 	for (;;) {
+		KASSERT(mutex_owned(&lfs_lock));
+		if (wrote_something == 0)
+			mtsleep(&lfs_writer_daemon, PVM, "lfswriter", hz/10 + 1,
+				&lfs_lock);
+
+		KASSERT(mutex_owned(&lfs_lock));
+		loopcount = 0;
+		wrote_something = 0;
 
 		/*
 		 * If global state wants a flush, flush everything.
 		 */
-		mutex_enter(&lfs_lock);
-		loopcount = 0;
 		if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
 			locked_queue_bytes > LFS_MAX_BYTES ||
 			lfs_subsys_pages > LFS_MAX_PAGES) {
 
 			if (lfs_do_flush) {
-				DLOG((DLOG_FLUSH, "daemon: lfs_do_flush\n"));
+				DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n"));
 			}
 			if (locked_queue_count > LFS_MAX_BUFS) {
-				DLOG((DLOG_FLUSH, "daemon: lqc = %d, max %d\n",
+				DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n",
 				      locked_queue_count, LFS_MAX_BUFS));
 			}
 			if (locked_queue_bytes > LFS_MAX_BYTES) {
-				DLOG((DLOG_FLUSH, "daemon: lqb = %ld, max %ld\n",
+				DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n",
 				      locked_queue_bytes, LFS_MAX_BYTES));
 			}
 			if (lfs_subsys_pages > LFS_MAX_PAGES) {
-				DLOG((DLOG_FLUSH, "daemon: lssp = %d, max %d\n",
+				DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n",
 				      lfs_subsys_pages, LFS_MAX_PAGES));
 			}
 
 			lfs_flush(NULL, SEGM_WRITERD, 0);
 			lfs_do_flush = 0;
+			KASSERT(mutex_owned(&lfs_lock));
+			continue;
 		}
-	}
-	/* NOTREACHED */
+		KASSERT(mutex_owned(&lfs_lock));
+		mutex_exit(&lfs_lock);
+ 
+ 		/*
+ 		 * Look through the list of LFSs to see if any of them
+ 		 * have requested pageouts.
+ 		 */
+ 		mutex_enter(&mountlist_lock);
+		lfsc = 0;
+		skipc = 0;
+ 		for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
+ 		     mp = nmp) {
+ 			if (vfs_busy(mp, &nmp)) {
+				++skipc;
+ 				continue;
+ 			}
+			KASSERT(!mutex_owned(&lfs_lock));
+ 			if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
+ 			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
+				++lfsc;
+ 				fs = VFSTOUFS(mp)->um_lfs;
+				int32_t ooffset = 0;
+				fsflags = SEGM_SINGLE;
+
+ 				mutex_enter(&lfs_lock);
+				ooffset = fs->lfs_offset;
+
+				if (fs->lfs_nextseg < fs->lfs_curseg && fs->lfs_nowrap) {
+					/* Don't try to write if we're suspended */
+					mutex_exit(&lfs_lock);
+					vfs_unbusy(mp, false, &nmp);
+					continue;
+				}
+				if (LFS_STARVED_FOR_SEGS(fs)) {
+					mutex_exit(&lfs_lock);
+
+					DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n"));
+					lfs_wakeup_cleaner(fs);
+					vfs_unbusy(mp, false, &nmp);
+					continue;
+				}
+
+ 				if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
+ 				     lfs_dirvcount > LFS_MAX_DIROP) &&
+				    fs->lfs_dirops == 0) {
+					fsflags &= ~SEGM_SINGLE;
+ 					fsflags |= SEGM_CKP;
+					DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n"));
+					lfs_flush_fs(fs, fsflags);
+				} else if (fs->lfs_pdflush) {
+ 					DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
+ 					lfs_flush_fs(fs, fsflags);
+ 				} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
+ 					DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
+ 					mutex_exit(&lfs_lock);
+ 					lfs_writer_enter(fs, "wrdirop");
+ 					lfs_flush_pchain(fs);
+ 					lfs_writer_leave(fs);
+					mutex_enter(&lfs_lock);
+				}
+				if (fs->lfs_offset != ooffset)
+					++wrote_something;
+				mutex_exit(&lfs_lock);
+ 			}
+			KASSERT(!mutex_owned(&lfs_lock));
+ 			vfs_unbusy(mp, false, &nmp);
+ 		}
+		if (lfsc + skipc == 0) {
+#ifdef notyet
+			mutex_enter(&lfs_lock);
+			lfs_writer_daemon = 0;
+			lfs_writer_lid = 0;
+			mutex_exit(&lfs_lock);
+			mutex_exit(&mountlist_lock);
+			break;
+#endif
+		}
+ 		mutex_exit(&mountlist_lock);
+ 
+ 		mutex_enter(&lfs_lock);
+ 	}
+	KASSERT(!mutex_owned(&lfs_lock));
+	KASSERT(!mutex_owned(&mountlist_lock));
+
+	/* Give up our extra reference so the module can be unloaded. */
+	mutex_enter(&vfs_list_lock);
+	if (vfs != NULL)
+		vfs->vfs_refcount--;
+	mutex_exit(&vfs_list_lock);
 }
 
 /*
@@ -1063,16 +1130,12 @@ lfs_mountfs(struct vnode *devvp, struct 
 	vput(vp);
 
 	/* Start the pagedaemon-anticipating daemon */
-	if (lfs_writer_daemon == 0 && kthread_create(PRI_BIO, 0, NULL,
+	mutex_enter(&lfs_lock);
+	if (lfs_writer_daemon == 0 && lfs_writer_lid == 0 &&
+	    kthread_create(PRI_BIO, 0, NULL,
 	    lfs_writerd, NULL, NULL, "lfs_writer") != 0)
 		panic("fork lfs_writer");
-	/*
-	 * XXX: Get extra reference to LFS vfsops.  This prevents unload,
-	 * but also prevents kernel panic due to text being unloaded
-	 * from below lfs_writerd.  When lfs_writerd can exit, remove
-	 * this!!!
-	 */
-	vfs_getopsbyname(MOUNT_LFS);
+	mutex_exit(&lfs_lock);
 
 	printf("WARNING: the log-structured file system is experimental\n"
 	    "WARNING: it may cause system crashes and/or corrupt data\n");
@@ -1576,6 +1639,7 @@ lfs_gop_write(struct vnode *vp, struct v
 	struct lfs *fs = ip->i_lfs;
 	struct segment *sp = fs->lfs_sp;
 	UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
+	const char * failreason = NULL;
 
 	ASSERT_SEGLOCK(fs);
 
@@ -1591,8 +1655,10 @@ lfs_gop_write(struct vnode *vp, struct v
 	 * We must write everything, however, if our vnode is being
 	 * reclaimed.
 	 */
-	if (LFS_STARVED_FOR_SEGS(fs) && vp != fs->lfs_flushvp)
-		goto tryagain;
+	if (LFS_STARVED_FOR_SEGS(fs) && !(vp->v_iflag & VI_XLOCK)) {
+		failreason = "Starved for segs and not flushing vp";
+ 		goto tryagain;
+	}
 
 	/*
 	 * Sometimes things slip past the filters in lfs_putpages,
@@ -1610,9 +1676,16 @@ lfs_gop_write(struct vnode *vp, struct v
 	 *
 	 * XXXUBC that last statement is an oversimplification of course.
 	 */
-	if (!LFS_SEGLOCK_HELD(fs) ||
-	    (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) ||
-	    (pgs[0]->offset & fs->lfs_bmask) != 0) {
+	if (!LFS_SEGLOCK_HELD(fs)) {
+		failreason = "Seglock not held";
+		goto tryagain;
+	}
+	if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
+		failreason = "Inode with no_gop_write";
+		goto tryagain;
+	}
+	if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
+		failreason = "Bad page offset";
 		goto tryagain;
 	}
 
@@ -1632,6 +1705,7 @@ lfs_gop_write(struct vnode *vp, struct v
 	KASSERT(eof >= 0);
 
 	if (startoffset >= eof) {
+		failreason = "Offset beyond EOF";
 		goto tryagain;
 	} else
 		bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
@@ -1646,9 +1720,11 @@ lfs_gop_write(struct vnode *vp, struct v
 			pgs[i]->flags &= ~PG_DELWRI;
 			pgs[i]->flags |= PG_PAGEOUT;
 			uvm_pageout_start(1);
+			mutex_enter(vp->v_interlock);
 			mutex_enter(&uvm_pageqlock);
 			uvm_pageunwire(pgs[i]);
 			mutex_exit(&uvm_pageqlock);
+			mutex_exit(vp->v_interlock);
 		}
 	}
 
@@ -1768,7 +1844,7 @@ lfs_gop_write(struct vnode *vp, struct v
 			nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
 			/*
 			 * LFS doesn't like async I/O here, dies with
-			 * and assert in lfs_bwrite().  Is that assert
+			 * an assert in lfs_bwrite().  Is that assert
 			 * valid?  I retained non-async behaviour when
 			 * converted this to use nestiobuf --pooka
 			 */
@@ -1805,6 +1881,10 @@ lfs_gop_write(struct vnode *vp, struct v
 		lfs_flush(fs, 0, 1);
 		mutex_exit(&lfs_lock);
 	}
+
+	if ((sp->seg_flags & SEGM_SINGLE) && fs->lfs_curseg != fs->lfs_startseg)
+		return EAGAIN;
+
 	return (0);
 
     tryagain:
@@ -1815,18 +1895,13 @@ lfs_gop_write(struct vnode *vp, struct v
 	mutex_enter(vp->v_interlock);
 
 	/* Tell why we're here, if we know */
-	if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
-		DLOG((DLOG_PAGE, "lfs_gop_write: clean pages dirtied\n"));
-	} else if ((pgs[0]->offset & fs->lfs_bmask) != 0) {
-		DLOG((DLOG_PAGE, "lfs_gop_write: not on block boundary\n"));
-	} else if (haveeof && startoffset >= eof) {
-		DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
-		      " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
-		      pgs[0]->offset, eof, npages));
-	} else if (LFS_STARVED_FOR_SEGS(fs)) {
-		DLOG((DLOG_PAGE, "lfs_gop_write: avail too low\n"));
-	} else {
-		DLOG((DLOG_PAGE, "lfs_gop_write: seglock not held\n"));
+	if (failreason != NULL) {
+		DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason));
+	}
+	if (haveeof && startoffset >= eof) {
+		DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
+		      " eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
+		      pgs[0]->offset, eof, npages));
 	}
 
 	mutex_enter(&uvm_pageqlock);
@@ -1898,14 +1973,14 @@ lfs_vinit(struct mount *mp, struct vnode
 			    i == 0)
 				continue;
 			if (ip->i_ffs1_db[i] != 0) {
-inconsistent:
 				lfs_dump_dinode(ip->i_din.ffs1_din);
-				panic("inconsistent inode");
+				panic("inconsistent inode (direct)");
 			}
 		}
 		for ( ; i < NDADDR + NIADDR; i++) {
 			if (ip->i_ffs1_ib[i - NDADDR] != 0) {
-				goto inconsistent;
+				lfs_dump_dinode(ip->i_din.ffs1_din);
+				panic("inconsistent inode (indirect)");
 			}
 		}
 #endif /* DEBUG */

Index: src/sys/ufs/lfs/lfs_vnops.c
diff -u src/sys/ufs/lfs/lfs_vnops.c:1.238 src/sys/ufs/lfs/lfs_vnops.c:1.239
--- src/sys/ufs/lfs/lfs_vnops.c:1.238	Tue Sep 20 14:01:33 2011
+++ src/sys/ufs/lfs/lfs_vnops.c	Mon Jan  2 22:10:45 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $	*/
+/*	$NetBSD: lfs_vnops.c,v 1.239 2012/01/02 22:10:45 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.238 2011/09/20 14:01:33 chs Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_vnops.c,v 1.239 2012/01/02 22:10:45 perseant Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_compat_netbsd.h"
@@ -363,6 +363,17 @@ lfs_inactive(void *v)
 		return 0;
 	}
 
+#ifdef DEBUG
+	/*
+	 * This might happen on unmount.
+	 * XXX If it happens at any other time, it should be a panic.
+	 */
+	if (ap->a_vp->v_uflag & VU_DIROP) {
+		struct inode *ip = VTOI(ap->a_vp);
+		printf("lfs_inactive: inactivating VU_DIROP? ino = %d\n", (int)ip->i_number);
+	}
+#endif /* DEBUG */
+
 	return ufs_inactive(v);
 }
 
@@ -438,7 +449,7 @@ lfs_set_dirop(struct vnode *dvp, struct 
 	}
 
 	++fs->lfs_dirops;
-	fs->lfs_doifile = 1;
+	/* fs->lfs_doifile = 1; */ /* XXX why? --ks */
 	mutex_exit(&lfs_lock);
 
 	/* Hold a reference so SET_ENDOP will be happy */
@@ -544,13 +555,15 @@ lfs_mark_vnode(struct vnode *vp)
 	if (!(ip->i_flag & IN_ADIROP)) {
 		if (!(vp->v_uflag & VU_DIROP)) {
 			mutex_enter(vp->v_interlock);
-			(void)lfs_vref(vp);
+			if (lfs_vref(vp) != 0)
+				panic("lfs_mark_vnode: could not vref");
 			++lfs_dirvcount;
 			++fs->lfs_dirvcount;
 			TAILQ_INSERT_TAIL(&fs->lfs_dchainhd, ip, i_lfs_dchain);
 			vp->v_uflag |= VU_DIROP;
 		}
 		++fs->lfs_nadirop;
+		ip->i_flag &= ~IN_CDIROP;
 		ip->i_flag |= IN_ADIROP;
 	} else
 		KASSERT(vp->v_uflag & VU_DIROP);
@@ -1153,7 +1166,8 @@ lfs_strategy(void *v)
 	struct vnode	*vp;
 	struct inode	*ip;
 	daddr_t		tbn;
-	int		i, sn, error, slept;
+#define MAXLOOP 25
+	int		i, sn, error, slept, loopcount;
 
 	bp = ap->a_bp;
 	vp = ap->a_vp;
@@ -1185,6 +1199,7 @@ lfs_strategy(void *v)
 	}
 
 	slept = 1;
+	loopcount = 0;
 	mutex_enter(&lfs_lock);
 	while (slept && fs->lfs_seglock) {
 		mutex_exit(&lfs_lock);
@@ -1213,12 +1228,19 @@ lfs_strategy(void *v)
 				      PRId64 "\n", ip->i_number, bp->b_lblkno));
 				mutex_enter(&lfs_lock);
 				if (LFS_SEGLOCK_HELD(fs) && fs->lfs_iocount) {
-					/* Cleaner can't wait for itself */
-					mtsleep(&fs->lfs_iocount,
-						(PRIBIO + 1) | PNORELOCK,
-						"clean2", 0,
-						&lfs_lock);
+					/*
+					 * Cleaner can't wait for itself.
+					 * Instead, wait for the blocks
+					 * to be written to disk.
+					 * XXX we need pribio in the test
+					 * XXX here.
+					 */
+					mtsleep(&fs->lfs_iocount,
+						(PRIBIO + 1) | PNORELOCK,
+						"clean2", hz/10 + 1,
+						&lfs_lock);
 					slept = 1;
+					++loopcount;
 					break;
 				} else if (fs->lfs_seglock) {
 					mtsleep(&fs->lfs_seglock,
@@ -1232,6 +1254,10 @@ lfs_strategy(void *v)
 			}
 		}
 		mutex_enter(&lfs_lock);
+		if (loopcount > MAXLOOP) {
+			printf("lfs_strategy: breaking out of clean2 loop\n");
+			break;
+		}
 	}
 	mutex_exit(&lfs_lock);
 
@@ -1240,37 +1266,39 @@ lfs_strategy(void *v)
 	return (0);
 }
 
-void
+/*
+ * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
+ * Technically this is a checkpoint (the on-disk state is valid)
+ * even though we are leaving out all the file data.
+ */
+int
 lfs_flush_dirops(struct lfs *fs)
 {
 	struct inode *ip, *nip;
 	struct vnode *vp;
 	extern int lfs_dostats;
 	struct segment *sp;
+	int flags = 0;
+	int error = 0;
 
 	ASSERT_MAYBE_SEGLOCK(fs);
 	KASSERT(fs->lfs_nadirop == 0);
 
 	if (fs->lfs_ronly)
-		return;
+		return EROFS;
 
 	mutex_enter(&lfs_lock);
 	if (TAILQ_FIRST(&fs->lfs_dchainhd) == NULL) {
 		mutex_exit(&lfs_lock);
-		return;
+		return 0;
 	} else
 		mutex_exit(&lfs_lock);
 
 	if (lfs_dostats)
 		++lfs_stats.flush_invoked;
 
-	/*
-	 * Inline lfs_segwrite/lfs_writevnodes, but just for dirops.
-	 * Technically this is a checkpoint (the on-disk state is valid)
-	 * even though we are leaving out all the file data.
-	 */
 	lfs_imtime(fs);
-	lfs_seglock(fs, SEGM_CKP);
+	lfs_seglock(fs, flags);
 	sp = fs->lfs_sp;
 
 	/*
@@ -1293,6 +1321,8 @@ lfs_flush_dirops(struct lfs *fs)
 		vp = ITOV(ip);
 
 		KASSERT((ip->i_flag & IN_ADIROP) == 0);
+		KASSERT(vp->v_uflag & VU_DIROP);
+		KASSERT(!(vp->v_iflag & VI_XLOCK));
 
 		/*
 		 * All writes to directories come from dirops; all
@@ -1300,9 +1330,7 @@ lfs_flush_dirops(struct lfs *fs)
 		 * cache, which we're not touching.  Reads to files
 		 * and/or directories will not be affected by writing
 		 * directory blocks inodes and file inodes.  So we don't
-		 * really need to lock.	 If we don't lock, though,
-		 * make sure that we don't clear IN_MODIFIED
-		 * unnecessarily.
+		 * really need to lock.
 		 */
 		if (vp->v_iflag & VI_XLOCK) {
 			mutex_enter(&lfs_lock);
@@ -1313,23 +1341,36 @@ lfs_flush_dirops(struct lfs *fs)
 		 */
 		if (vp->v_type != VREG &&
 		    ((ip->i_flag & IN_ALLMOD) || !VPISEMPTY(vp))) {
-			lfs_writefile(fs, sp, vp);
+			error = lfs_writefile(fs, sp, vp);
 			if (!VPISEMPTY(vp) && !WRITEINPROG(vp) &&
 			    !(ip->i_flag & IN_ALLMOD)) {
 			    	mutex_enter(&lfs_lock);
 				LFS_SET_UINO(ip, IN_MODIFIED);
 			    	mutex_exit(&lfs_lock);
 			}
+			if (error && (sp->seg_flags & SEGM_SINGLE)) {
+				mutex_enter(&lfs_lock);
+				error = EAGAIN;
+				break;
+			}
 		}
 		KDASSERT(ip->i_number != LFS_IFILE_INUM);
-		(void) lfs_writeinode(fs, sp, ip);
+		error = lfs_writeinode(fs, sp, ip);
 		mutex_enter(&lfs_lock);
+		if (error && (sp->seg_flags & SEGM_SINGLE)) {
+			error = EAGAIN;
+			break;
+		}
+
 		/*
-		 * XXX
-		 * LK_EXCLOTHER is dead -- what is intended here?
-		 * if (waslocked == LK_EXCLOTHER)
-		 *	LFS_SET_UINO(ip, IN_MODIFIED);
+		 * We might need to update these inodes again,
+		 * for example, if they have data blocks to write.
+		 * Make sure that after this flush, they are still
+		 * marked IN_MODIFIED so that we don't forget to
+		 * write them.
 		 */
+		/* XXX only for non-directories? --KS */
+		LFS_SET_UINO(ip, IN_MODIFIED);
 	}
 	mutex_exit(&lfs_lock);
 	/* We've written all the dirops there are */
@@ -1337,6 +1378,8 @@ lfs_flush_dirops(struct lfs *fs)
 	lfs_finalize_fs_seguse(fs);
 	(void) lfs_writeseg(fs, sp);
 	lfs_segunlock(fs);
+
+	return error;
 }
 
 /*
@@ -1346,29 +1389,30 @@ lfs_flush_dirops(struct lfs *fs)
  * for any reason, just skip it; if we have to wait for the cleaner,
  * abort.  The writer daemon will call us again later.
  */
-void
+int
 lfs_flush_pchain(struct lfs *fs)
 {
 	struct inode *ip, *nip;
 	struct vnode *vp;
 	extern int lfs_dostats;
 	struct segment *sp;
-	int error;
+	int error, error2;
 
 	ASSERT_NO_SEGLOCK(fs);
 
 	if (fs->lfs_ronly)
-		return;
+		return EROFS;
 
 	mutex_enter(&lfs_lock);
 	if (TAILQ_FIRST(&fs->lfs_pchainhd) == NULL) {
 		mutex_exit(&lfs_lock);
-		return;
+		return 0;
 	} else
 		mutex_exit(&lfs_lock);
 
 	/* Get dirops out of the way */
-	lfs_flush_dirops(fs);
+	if ((error = lfs_flush_dirops(fs)) != 0)
+		return error;
 
 	if (lfs_dostats)
 		++lfs_stats.flush_invoked;
@@ -1422,12 +1466,12 @@ lfs_flush_pchain(struct lfs *fs)
 		    	mutex_exit(&lfs_lock);
 		}
 		KDASSERT(ip->i_number != LFS_IFILE_INUM);
-		(void) lfs_writeinode(fs, sp, ip);
+		error2 = lfs_writeinode(fs, sp, ip);
 
 		VOP_UNLOCK(vp);
 		lfs_vunref(vp);
 
-		if (error == EAGAIN) {
+		if (error == EAGAIN || error2 == EAGAIN) {
 			lfs_writeseg(fs, sp);
 			mutex_enter(&lfs_lock);
 			break;
@@ -1437,6 +1481,8 @@ lfs_flush_pchain(struct lfs *fs)
 	mutex_exit(&lfs_lock);
 	(void) lfs_writeseg(fs, sp);
 	lfs_segunlock(fs);
+
+	return 0;
 }
 
 /*
@@ -1682,7 +1728,8 @@ segwait_common:
 		/* Wait for the log to wrap, if asked */
 		if (*(int *)ap->a_data) {
 			mutex_enter(ap->a_vp->v_interlock);
-			lfs_vref(ap->a_vp);
+			if (lfs_vref(ap->a_vp) != 0)
+				panic("LFCNWRAPPASS: lfs_vref failed");
 			VTOI(ap->a_vp)->i_lfs_iflags |= LFSI_WRAPWAIT;
 			log(LOG_NOTICE, "LFCNPASS waiting for log wrap\n");
 			error = mtsleep(&fs->lfs_nowrap, PCATCH | PUSER,
@@ -1746,6 +1793,7 @@ lfs_getpages(void *v)
 static void
 wait_for_page(struct vnode *vp, struct vm_page *pg, const char *label)
 {
+	KASSERT(mutex_owned(vp->v_interlock));
 	if ((pg->flags & PG_BUSY) == 0)
 		return;		/* Nothing to wait for! */
 
@@ -1786,6 +1834,7 @@ static void
 write_and_wait(struct lfs *fs, struct vnode *vp, struct vm_page *pg,
 	       int seglocked, const char *label)
 {
+	KASSERT(mutex_owned(vp->v_interlock));
 #ifndef BUSYWAIT
 	struct inode *ip = VTOI(vp);
 	struct segment *sp = fs->lfs_sp;
@@ -1814,12 +1863,15 @@ write_and_wait(struct lfs *fs, struct vn
 		mutex_enter(vp->v_interlock);
 		wait_for_page(vp, pg, label);
 	}
-	if (label != NULL && count > 1)
-		printf("lfs_putpages[%d]: %s: %sn = %d\n", curproc->p_pid,
-		       label, (count > 0 ? "looping, " : ""), count);
+	if (label != NULL && count > 1) {
+		DLOG((DLOG_PAGE, "lfs_putpages[%d]: %s: %sn = %d\n",
+		      curproc->p_pid, label, (count > 0 ? "looping, " : ""),
+		      count));
+	}
 #else
 	preempt(1);
 #endif
+	KASSERT(mutex_owned(vp->v_interlock));
 }
 
 /*
@@ -1849,6 +1901,7 @@ check_dirty(struct lfs *fs, struct vnode
 	int pages_per_block = fs->lfs_bsize >> PAGE_SHIFT;
 	int pagedaemon = (curlwp == uvm.pagedaemon_lwp);
 
+	KASSERT(mutex_owned(vp->v_interlock));
 	ASSERT_MAYBE_SEGLOCK(fs);
   top:
 	by_list = (vp->v_uobj.uo_npages <=
@@ -1891,6 +1944,7 @@ check_dirty(struct lfs *fs, struct vnode
 		 */
 		nonexistent = dirty = 0;
 		for (i = 0; i == 0 || i < pages_per_block; i++) {
+			KASSERT(mutex_owned(vp->v_interlock));
 			if (by_list && pages_per_block <= 1) {
 				pgs[i] = pg = curpg;
 			} else {
@@ -1916,13 +1970,16 @@ check_dirty(struct lfs *fs, struct vnode
 				DLOG((DLOG_PAGE, "lfs_putpages: avoiding 3-way or pagedaemon deadlock\n"));
 				if (pgp)
 					*pgp = pg;
+				KASSERT(mutex_owned(vp->v_interlock));
 				return -1;
 			}
 
 			while (pg->flags & PG_BUSY) {
 				wait_for_page(vp, pg, NULL);
+				KASSERT(mutex_owned(vp->v_interlock));
 				if (i > 0)
 					uvm_page_unbusy(pgs, i);
+				KASSERT(mutex_owned(vp->v_interlock));
 				goto top;
 			}
 			pg->flags |= PG_BUSY;
@@ -1944,6 +2001,7 @@ check_dirty(struct lfs *fs, struct vnode
 
 		any_dirty += dirty;
 		KASSERT(nonexistent == 0);
+		KASSERT(mutex_owned(vp->v_interlock));
 
 		/*
 		 * If any are dirty make all dirty; unbusy them,
@@ -1952,8 +2010,10 @@ check_dirty(struct lfs *fs, struct vnode
 		 * they're on their way to disk.
 		 */
 		for (i = 0; i == 0 || i < pages_per_block; i++) {
+			KASSERT(mutex_owned(vp->v_interlock));
 			pg = pgs[i];
 			KASSERT(!((pg->flags & PG_CLEAN) && (pg->flags & PG_DELWRI)));
+			KASSERT(pg->flags & PG_BUSY);
 			if (dirty) {
 				pg->flags &= ~PG_CLEAN;
 				if (flags & PGO_FREE) {
@@ -1985,6 +2045,7 @@ check_dirty(struct lfs *fs, struct vnode
 		}
 	}
 
+	KASSERT(mutex_owned(vp->v_interlock));
 	return any_dirty;
 }
 
@@ -2048,9 +2109,11 @@ lfs_putpages(void *v)
 	struct segment *sp;
 	off_t origoffset, startoffset, endoffset, origendoffset, blkeof;
 	off_t off, max_endoffset;
-	bool seglocked, sync, pagedaemon;
+	bool seglocked, sync, pagedaemon, reclaim;
 	struct vm_page *pg, *busypg;
 	UVMHIST_FUNC("lfs_putpages"); UVMHIST_CALLED(ubchist);
+	int oreclaim = 0;
+	int donewriting = 0;
 #ifdef DEBUG
 	int debug_n_again, debug_n_dirtyclean;
 #endif
@@ -2059,8 +2122,11 @@ lfs_putpages(void *v)
 	ip = VTOI(vp);
 	fs = ip->i_lfs;
 	sync = (ap->a_flags & PGO_SYNCIO) != 0;
+	reclaim = (ap->a_flags & PGO_RECLAIM) != 0;
 	pagedaemon = (curlwp == uvm.pagedaemon_lwp);
 
+	KASSERT(mutex_owned(vp->v_interlock));
+
 	/* Putpages does nothing for metadata. */
 	if (vp == fs->lfs_ivnode || vp->v_type != VREG) {
 		mutex_exit(vp->v_interlock);
@@ -2086,6 +2152,8 @@ lfs_putpages(void *v)
 			TAILQ_REMOVE(&fs->lfs_pchainhd, ip, i_lfs_pchain);
 		}
 		mutex_exit(&lfs_lock);
+
+		KASSERT(!mutex_owned(vp->v_interlock));
 		return 0;
 	}
 
@@ -2093,12 +2161,15 @@ lfs_putpages(void *v)
 
 	/*
 	 * Ignore requests to free pages past EOF but in the same block
-	 * as EOF, unless the request is synchronous.  (If the request is
-	 * sync, it comes from lfs_truncate.)
-	 * XXXUBC Make these pages look "active" so the pagedaemon won't
-	 * XXXUBC bother us with them again.
+	 * as EOF, unless the vnode is being reclaimed or the request
+	 * is synchronous.  (If the request is sync, it comes from
+	 * lfs_truncate.)
+	 *
+	 * To avoid being flooded with this request, make these pages
+	 * look "active".
 	 */
-	if (!sync && ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
+	if (!sync && !reclaim &&
+	    ap->a_offlo >= ip->i_size && ap->a_offlo < blkeof) {
 		origoffset = ap->a_offlo;
 		for (off = origoffset; off < blkeof; off += fs->lfs_bsize) {
 			pg = uvm_pagelookup(&vp->v_uobj, off);
@@ -2154,8 +2225,13 @@ lfs_putpages(void *v)
 	 * If not cleaning, just send the pages through genfs_putpages
 	 * to be returned to the pool.
 	 */
-	if (!(ap->a_flags & PGO_CLEANIT))
-		return genfs_putpages(v);
+	if (!(ap->a_flags & PGO_CLEANIT)) {
+		DLOG((DLOG_PAGE, "lfs_putpages: no cleanit vn %p ino %d (flags %x)\n",
+		      vp, (int)ip->i_number, ap->a_flags));
+		int r = genfs_putpages(v);
+		KASSERT(!mutex_owned(vp->v_interlock));
+		return r;
+	}
 
 	/* Set PGO_BUSYFAIL to avoid deadlocks */
 	ap->a_flags |= PGO_BUSYFAIL;
@@ -2169,6 +2245,7 @@ lfs_putpages(void *v)
 #endif
 	do {
 		int r;
+		KASSERT(mutex_owned(vp->v_interlock));
 
 		/* Count the number of dirty pages */
 		r = check_dirty(fs, vp, startoffset, endoffset, blkeof,
@@ -2191,8 +2268,10 @@ lfs_putpages(void *v)
 		r = genfs_do_putpages(vp, startoffset, endoffset,
 				       ap->a_flags & ~PGO_SYNCIO, &busypg);
 		ip->i_lfs_iflags &= ~LFSI_NO_GOP_WRITE;
-		if (r != EDEADLK)
-			return r;
+		if (r != EDEADLK) {
+			KASSERT(!mutex_owned(vp->v_interlock));
+			return r;
+		}
 
 		/* One of the pages was busy.  Start over. */
 		mutex_enter(vp->v_interlock);
@@ -2204,8 +2283,8 @@ lfs_putpages(void *v)
 
 #ifdef DEBUG
 	if (debug_n_dirtyclean > TOOMANY)
-		printf("lfs_putpages: dirtyclean: looping, n = %d\n",
-		       debug_n_dirtyclean);
+		DLOG((DLOG_PAGE, "lfs_putpages: dirtyclean: looping, n = %d\n",
+		      debug_n_dirtyclean));
 #endif
 
 	/*
@@ -2228,6 +2307,7 @@ lfs_putpages(void *v)
 		wakeup(&lfs_writer_daemon);
 		mutex_exit(&lfs_lock);
 		preempt();
+		KASSERT(!mutex_owned(vp->v_interlock));
 		return EWOULDBLOCK;
 	}
 
@@ -2239,26 +2319,28 @@ lfs_putpages(void *v)
 	 */
 	if ((ap->a_flags & (PGO_CLEANIT|PGO_LOCKED)) == PGO_CLEANIT &&
 	    (vp->v_uflag & VU_DIROP)) {
-		int locked;
-
 		DLOG((DLOG_PAGE, "lfs_putpages: flushing VU_DIROP\n"));
-		/* XXX VOP_ISLOCKED() may not be used for lock decisions. */
-		locked = (VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
+
+		lfs_writer_enter(fs, "ppdirop");
+
+		/* Note if we hold the vnode locked */
+		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
+		{
+		    DLOG((DLOG_PAGE, "lfs_putpages: dirop inode already locked\n"));
+		} else {
+		    DLOG((DLOG_PAGE, "lfs_putpages: dirop inode not locked\n"));
+		}
 		mutex_exit(vp->v_interlock);
-		lfs_writer_enter(fs, "ppdirop");
-		if (locked)
-			VOP_UNLOCK(vp); /* XXX why? */
 
 		mutex_enter(&lfs_lock);
 		lfs_flush_fs(fs, sync ? SEGM_SYNC : 0);
 		mutex_exit(&lfs_lock);
 
-		if (locked)
-			VOP_LOCK(vp, LK_EXCLUSIVE);
 		mutex_enter(vp->v_interlock);
 		lfs_writer_leave(fs);
 
-		/* XXX the flush should have taken care of this one too! */
+		/* The flush will have cleaned out this vnode as well,
+		   no need to do more to it. */
 	}
 
 	/*
@@ -2286,8 +2368,10 @@ lfs_putpages(void *v)
 	if (!seglocked) {
 		mutex_exit(vp->v_interlock);
 		error = lfs_seglock(fs, SEGM_PROT | (sync ? SEGM_SYNC : 0));
-		if (error != 0)
-			return error;
+		if (error != 0) {
+			KASSERT(!mutex_owned(vp->v_interlock));
+			return error;
+		}
 		mutex_enter(vp->v_interlock);
 		lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
 	}
@@ -2295,6 +2379,12 @@ lfs_putpages(void *v)
 	KASSERT(sp->vp == NULL);
 	sp->vp = vp;
 
+	/* Note segments written by reclaim; only for debugging */
+	if ((vp->v_iflag & VI_XLOCK) != 0) {
+		sp->seg_flags |= SEGM_RECLAIM;
+		fs->lfs_reclino = ip->i_number;
+	}
+
 	/*
 	 * Ensure that the partial segment is marked SS_DIROP if this
 	 * vnode is a DIROP.
@@ -2313,10 +2403,11 @@ lfs_putpages(void *v)
 #endif
 	do {
 		busypg = NULL;
+		KASSERT(mutex_owned(vp->v_interlock));
 		if (check_dirty(fs, vp, startoffset, endoffset, blkeof,
 				ap->a_flags, 0, &busypg) < 0) {
 			mutex_exit(vp->v_interlock);
-
+			/* XXX why? --ks */
 			mutex_enter(vp->v_interlock);
 			write_and_wait(fs, vp, busypg, seglocked, NULL);
 			if (!seglocked) {
@@ -2330,8 +2421,12 @@ lfs_putpages(void *v)
 		}
 	
 		busypg = NULL;
+		KASSERT(!mutex_owned(&uvm_pageqlock));
+		oreclaim = (ap->a_flags & PGO_RECLAIM);
+		ap->a_flags &= ~PGO_RECLAIM;
 		error = genfs_do_putpages(vp, startoffset, endoffset,
 					   ap->a_flags, &busypg);
+		ap->a_flags |= oreclaim;
 	
 		if (error == EDEADLK || error == EAGAIN) {
 			DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
@@ -2339,20 +2434,40 @@ lfs_putpages(void *v)
 			      ip->i_number, fs->lfs_offset,
 			      dtosn(fs, fs->lfs_offset)));
 
-			mutex_enter(vp->v_interlock);
-			write_and_wait(fs, vp, busypg, seglocked, "again");
+			if (oreclaim) {
+				mutex_enter(vp->v_interlock);
+				write_and_wait(fs, vp, busypg, seglocked, "again");
+				mutex_exit(vp->v_interlock);
+			} else {
+				if ((sp->seg_flags & SEGM_SINGLE) &&
+				    fs->lfs_curseg != fs->lfs_startseg)
+					donewriting = 1;
+			}
+		} else if (error) {
+			DLOG((DLOG_PAGE, "lfs_putpages: genfs_putpages returned"
+			      " %d ino %d off %x (seg %d)\n", error,
+			      (int)ip->i_number, fs->lfs_offset,
+			      dtosn(fs, fs->lfs_offset)));
 		}
+		/* genfs_do_putpages loses the interlock */
 #ifdef DEBUG
 		++debug_n_again;
 #endif
-	} while (error == EDEADLK);
+		if (oreclaim && error == EAGAIN) {
+			DLOG((DLOG_PAGE, "vp %p ino %d vi_flags %x a_flags %x avoiding vclean panic\n",
+			      vp, (int)ip->i_number, vp->v_iflag, ap->a_flags));
+			mutex_enter(vp->v_interlock);
+		}
+		if (error == EDEADLK)
+			mutex_enter(vp->v_interlock);
+	} while (error == EDEADLK || (oreclaim && error == EAGAIN));
 #ifdef DEBUG
 	if (debug_n_again > TOOMANY)
-		printf("lfs_putpages: again: looping, n = %d\n", debug_n_again);
+		DLOG((DLOG_PAGE, "lfs_putpages: again: looping, n = %d\n", debug_n_again));
 #endif
 
 	KASSERT(sp != NULL && sp->vp == vp);
-	if (!seglocked) {
+	if (!seglocked && !donewriting) {
 		sp->vp = NULL;
 
 		/* Write indirect blocks as well */
@@ -2376,8 +2491,10 @@ lfs_putpages(void *v)
 	 * If we were called from lfs_writefile, we don't need to clean up
 	 * the FIP or unlock the segment lock.	We're done.
 	 */
-	if (seglocked)
+	if (seglocked) {
+		KASSERT(!mutex_owned(vp->v_interlock));
 		return error;
+	}
 
 	/* Clean up FIP and send it to disk. */
 	lfs_release_finfo(fs);
@@ -2417,6 +2534,7 @@ lfs_putpages(void *v)
 		}
 		mutex_exit(vp->v_interlock);
 	}
+	KASSERT(!mutex_owned(vp->v_interlock));
 	return error;
 }
 

Index: src/sys/ufs/ufs/inode.h
diff -u src/sys/ufs/ufs/inode.h:1.58 src/sys/ufs/ufs/inode.h:1.59
--- src/sys/ufs/ufs/inode.h:1.58	Tue Jul 12 02:22:13 2011
+++ src/sys/ufs/ufs/inode.h	Mon Jan  2 22:10:45 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: inode.h,v 1.58 2011/07/12 02:22:13 dholland Exp $	*/
+/*	$NetBSD: inode.h,v 1.59 2012/01/02 22:10:45 perseant Exp $	*/
 
 /*
  * Copyright (c) 1982, 1989, 1993
@@ -242,7 +242,7 @@ struct inode {
 #define	IN_ADIROP	0x0200		/* LFS: dirop in progress */
 #define	IN_SPACECOUNTED	0x0400		/* Blocks to be freed in free count. */
 #define	IN_PAGING       0x1000		/* LFS: file is on paging queue */
-
+#define IN_CDIROP       0x4000          /* LFS: dirop completed pending i/o */
 #if defined(_KERNEL)
 
 /*

Index: src/sys/ufs/ufs/ufs_readwrite.c
diff -u src/sys/ufs/ufs/ufs_readwrite.c:1.100 src/sys/ufs/ufs/ufs_readwrite.c:1.101
--- src/sys/ufs/ufs/ufs_readwrite.c:1.100	Fri Nov 18 21:18:52 2011
+++ src/sys/ufs/ufs/ufs_readwrite.c	Mon Jan  2 22:10:45 2012
@@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $	*/
+/*	$NetBSD: ufs_readwrite.c,v 1.101 2012/01/02 22:10:45 perseant Exp $	*/
 
 /*-
  * Copyright (c) 1993
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.100 2011/11/18 21:18:52 christos Exp $");
+__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.101 2012/01/02 22:10:45 perseant Exp $");
 
 #ifdef LFS_READWRITE
 #define	FS			struct lfs
@@ -294,6 +294,7 @@ WRITE(void *v)
 
 #ifdef LFS_READWRITE
 	async = true;
+	lfs_availwait(fs, btofsb(fs, uio->uio_resid));
 	lfs_check(vp, LFS_UNUSED_LBN, 0);
 #endif /* !LFS_READWRITE */
 	if (!usepc)
