Author: jeff
Date: Thu Nov  8 01:41:04 2012
New Revision: 242734
URL: http://svnweb.freebsd.org/changeset/base/242734

Log:
   - Implement BIO_FLUSH support around journal entries.  This will not 100%
     solve power loss problems with dishonest write caches.  However, it
     should improve the situation and force a full fsck when recovery is
     unable to resolve inconsistencies with the journal.
   - Resolve a case where the journal could wrap in an unsafe way, causing
     us to prematurely lose journal entries in very specific scenarios.
  
  Discussed with:       mckusick
  MFC after:    1 month
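
  The new flush behavior is gated by the debug.softdep.flushcache sysctl
  added below, which defaults to 0 (disabled).  As a minimal, illustrative
  userland sketch (not part of the commit itself), it can be enabled
  programmatically with sysctlbyname(3), equivalent to running
  "sysctl debug.softdep.flushcache=1" as root:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <err.h>
#include <stdio.h>

int
main(void)
{
        int enable = 1;

        /* Set debug.softdep.flushcache to 1; requires root. */
        if (sysctlbyname("debug.softdep.flushcache", NULL, NULL,
            &enable, sizeof(enable)) != 0)
                err(1, "sysctlbyname(debug.softdep.flushcache)");
        printf("journal BIO_FLUSH barriers enabled\n");
        return (0);
}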

Modified:
  head/sys/ufs/ffs/ffs_softdep.c

Modified: head/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- head/sys/ufs/ffs/ffs_softdep.c      Thu Nov  8 01:38:30 2012        (r242733)
+++ head/sys/ufs/ffs/ffs_softdep.c      Thu Nov  8 01:41:04 2012        (r242734)
@@ -88,6 +88,8 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
+#include <geom/geom.h>
+
 #include <ddb/ddb.h>
 
 #ifndef SOFTUPDATES
@@ -802,6 +804,7 @@ static      void handle_written_jnewblk(struc
 static void handle_written_jblkdep(struct jblkdep *);
 static void handle_written_jfreefrag(struct jfreefrag *);
 static void complete_jseg(struct jseg *);
+static void complete_jsegs(struct jseg *);
 static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
 static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
 static void jremref_write(struct jremref *, struct jseg *, uint8_t *);
@@ -1227,6 +1230,7 @@ static struct callout softdep_callout;
 static int req_pending;
 static int req_clear_inodedeps;        /* syncer process flush some inodedeps */
 static int req_clear_remove;   /* syncer process flush some freeblks */
+static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
 
 /*
  * runtime statistics
@@ -1310,6 +1314,8 @@ SYSCTL_INT(_debug_softdep, OID_AUTO, cle
     &stat_cleanup_retries, 0, "");
 SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
     &stat_cleanup_failures, 0, "");
+SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
+    &softdep_flushcache, 0, "");
 
 SYSCTL_DECL(_vfs_ffs);
 
@@ -3078,6 +3084,67 @@ softdep_flushjournal(mp)
        FREE_LOCK(&lk);
 }
 
+static void softdep_synchronize_completed(struct bio *);
+static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
+
+static void
+softdep_synchronize_completed(bp)
+        struct bio *bp;
+{
+       struct jseg *oldest;
+       struct jseg *jseg;
+
+       /*
+        * caller1 marks the last segment written before we issued the
+        * synchronize cache.
+        */
+       jseg = bp->bio_caller1;
+       oldest = NULL;
+       ACQUIRE_LOCK(&lk);
+       /*
+        * Mark all the journal entries waiting on the synchronize cache
+        * as completed so they may continue on.
+        */
+       while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
+               jseg->js_state |= COMPLETE;
+               oldest = jseg;
+               jseg = TAILQ_PREV(jseg, jseglst, js_next);
+       }
+       /*
+        * Restart deferred journal entry processing from the oldest
+        * completed jseg.
+        */
+       if (oldest)
+               complete_jsegs(oldest);
+
+       FREE_LOCK(&lk);
+       g_destroy_bio(bp);
+}
+
+/*
+ * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
+ * barriers.  The journal must be written prior to any blocks that depend
+ * on it and the journal cannot be released until the blocks have been
+ * written.  This code handles both barriers simultaneously.
+ */
+static void
+softdep_synchronize(bp, ump, caller1)
+       struct bio *bp;
+       struct ufsmount *ump;
+       void *caller1;
+{
+
+       bp->bio_cmd = BIO_FLUSH;
+       bp->bio_flags |= BIO_ORDERED;
+       bp->bio_data = NULL;
+       bp->bio_offset = ump->um_cp->provider->mediasize;
+       bp->bio_length = 0;
+       bp->bio_done = softdep_synchronize_completed;
+       bp->bio_caller1 = caller1;
+       g_io_request(bp,
+           (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
+}
+
 /*
  * Flush some journal records to disk.
  */
@@ -3092,8 +3159,10 @@ softdep_process_journal(mp, needwk, flag
        struct worklist *wk;
        struct jseg *jseg;
        struct buf *bp;
+       struct bio *bio;
        uint8_t *data;
        struct fs *fs;
+       int shouldflush;
        int segwritten;
        int jrecmin;    /* Minimum records per block. */
        int jrecmax;    /* Maximum records per block. */
@@ -3104,6 +3173,9 @@ softdep_process_journal(mp, needwk, flag
 
        if (MOUNTEDSUJ(mp) == 0)
                return;
+       shouldflush = softdep_flushcache;
+       bio = NULL;
+       jseg = NULL;
        ump = VFSTOUFS(mp);
        fs = ump->um_fs;
        jblocks = ump->softdep_jblocks;
@@ -3152,6 +3224,10 @@ softdep_process_journal(mp, needwk, flag
                LIST_INIT(&jseg->js_entries);
                LIST_INIT(&jseg->js_indirs);
                jseg->js_state = ATTACHED;
+               if (shouldflush == 0)
+                       jseg->js_state |= COMPLETE;
+               else if (bio == NULL)
+                       bio = g_alloc_bio();
                jseg->js_jblocks = jblocks;
                bp = geteblk(fs->fs_bsize, 0);
                ACQUIRE_LOCK(&lk);
@@ -3284,6 +3360,17 @@ softdep_process_journal(mp, needwk, flag
                ACQUIRE_LOCK(&lk);
        }
        /*
+        * If we wrote a segment, issue a synchronize cache so the journal
+        * is reflected on disk before the data is written.  Since reclaiming
+        * journal space also requires writing a journal record, this
+        * process also enforces a barrier before reclamation.
+        */
+       if (segwritten && shouldflush) {
+               softdep_synchronize(bio, ump, 
+                   TAILQ_LAST(&jblocks->jb_segs, jseglst));
+       } else if (bio)
+               g_destroy_bio(bio);
+       /*
         * If we've suspended the filesystem because we ran out of journal
         * space either try to sync it here to make some progress or
         * unsuspend it if we already have.
@@ -3366,25 +3453,17 @@ complete_jseg(jseg)
 }
 
 /*
- * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Handle jseg
- * completions in order only.
+ * Determine which jsegs are ready for completion processing.  Waits for
+ * the synchronize cache to complete and forces in-order completion of
+ * journal entries.
  */
 static void
-handle_written_jseg(jseg, bp)
+complete_jsegs(jseg)
        struct jseg *jseg;
-       struct buf *bp;
 {
        struct jblocks *jblocks;
        struct jseg *jsegn;
 
-       if (jseg->js_refs == 0)
-               panic("handle_written_jseg: No self-reference on %p", jseg);
-       jseg->js_state |= DEPCOMPLETE;
-       /*
-        * We'll never need this buffer again, set flags so it will be
-        * discarded.
-        */
-       bp->b_flags |= B_INVAL | B_NOCACHE;
        jblocks = jseg->js_jblocks;
        /*
         * Don't allow out of order completions.  If this isn't the first
@@ -3393,12 +3472,12 @@ handle_written_jseg(jseg, bp)
        if (jseg != jblocks->jb_writeseg)
                return;
        /* Iterate through available jsegs processing their entries. */
-       do {
+       while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
                jblocks->jb_oldestwrseq = jseg->js_oldseq;
                jsegn = TAILQ_NEXT(jseg, js_next);
                complete_jseg(jseg);
                jseg = jsegn;
-       } while (jseg && jseg->js_state & DEPCOMPLETE);
+       }
        jblocks->jb_writeseg = jseg;
        /*
         * Attempt to free jsegs now that oldestwrseq may have advanced. 
@@ -3406,6 +3485,27 @@ handle_written_jseg(jseg, bp)
        free_jsegs(jblocks);
 }
 
+/*
+ * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
+ * the final completions.
+ */
+static void
+handle_written_jseg(jseg, bp)
+       struct jseg *jseg;
+       struct buf *bp;
+{
+
+       if (jseg->js_refs == 0)
+               panic("handle_written_jseg: No self-reference on %p", jseg);
+       jseg->js_state |= DEPCOMPLETE;
+       /*
+        * We'll never need this buffer again, set flags so it will be
+        * discarded.
+        */
+       bp->b_flags |= B_INVAL | B_NOCACHE;
+       complete_jsegs(jseg);
+}
+
 static inline struct jsegdep *
 inoref_jseg(inoref)
        struct inoref *inoref;
@@ -4191,8 +4291,13 @@ free_jsegs(jblocks)
                        jblocks->jb_oldestseg = jseg;
                        return;
                }
-               if (!LIST_EMPTY(&jseg->js_indirs) &&
-                   jseg->js_seq >= jblocks->jb_oldestwrseq)
+               if (jseg->js_seq > jblocks->jb_oldestwrseq)
+                       break;
+               /*
+                * We can free jsegs that didn't write entries when
+                * oldestwrseq == js_seq.
+                */
+               if (jseg->js_cnt != 0)
                        break;
                free_jseg(jseg, jblocks);
        }
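
The in-order completion rule enforced by complete_jsegs() above can be
seen in isolation with the following minimal userspace sketch.  It is
illustrative only: the flag names mirror the diff, while the struct,
the array, and main() are simplified, hypothetical stand-ins for the
kernel's jseg list.  A segment's entries are processed only once both
its own write (DEPCOMPLETE) and the cache flush behind it (COMPLETE)
have finished, and processing stops at the first segment still missing
a flag, even if later segments are already done:

#include <stdio.h>

#define COMPLETE        0x01    /* BIO_FLUSH behind this segment done. */
#define DEPCOMPLETE     0x02    /* The segment's own write done. */
#define ALLCOMPLETE     (COMPLETE | DEPCOMPLETE)

struct seg {
        int state;
};

int
main(void)
{
        /* Segment 2 is written but not yet flushed; 3 must wait. */
        struct seg segs[4] = { { ALLCOMPLETE }, { ALLCOMPLETE },
            { DEPCOMPLETE }, { ALLCOMPLETE } };
        int i;

        /*
         * Like the while loop in complete_jsegs(): process entries in
         * queue order and stop at the first segment that is not yet
         * ALLCOMPLETE.
         */
        for (i = 0; i < 4; i++) {
                if ((segs[i].state & ALLCOMPLETE) != ALLCOMPLETE)
                        break;
                printf("segment %d: entries processed\n", i);
        }
        printf("stopped at segment %d; it stays queued as the new "
            "write head\n", i);
        return (0);
}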