This adds recovery of the raid5/6 log.

We've stored a %journal_tail in the super_block, which indicates the
position from which we need to replay data.  So we scan the log from
there and replay valid meta/data/parity sets until we find an invalid
one.  Replaying simply reads data/parity from the raid5/6 log and
issues writes to the raid disks where the data belongs.  Please note
that a whole meta/data/parity set is discarded if it fails the sanity
check in its meta block.

After recovery, we also append an empty meta block and update the
%journal_tail in the super_block.  This avoids the following
situation: suppose the layout on the raid5/6 log is

[valid A][invalid B][valid C],

so block A is the only one we should replay.

Recovery then ends up pointing to block A, as block B is invalid.  If
new writes now come in and append after block A, block B may be
overwritten with a valid meta/data/parity set.  Should a power loss
happen at that point, the next recovery starts again from block A,
and since block B is now valid, it may replay block C as well, even
though C has become stale.

Signed-off-by: Liu Bo <bo.li....@oracle.com>
---
 fs/btrfs/raid56.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5d7ea235..dea33c4 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1530,10 +1530,161 @@ static int btrfs_r5l_write_empty_meta_block(struct btrfs_r5l_log *log, u64 pos,
        return ret;
 }
 
+/* state carried across the log scan during recovery */
+struct btrfs_r5l_recover_ctx {
+       u64 pos;                /* log offset of the current meta block */
+       u64 seq;                /* expected sequence number at @pos */
+       u64 total_size;         /* bytes taken by the current meta/data/parity set */
+       struct page *meta_page; /* holds the current meta block */
+       struct page *io_page;   /* scratch page for replaying data/parity */
+};
+
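+/*
+ * Read the meta block at ctx->pos and verify that its magic, position
+ * and sequence number are what we expect; a mismatch marks the end of
+ * the valid part of the log.
+ */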
+static int btrfs_r5l_recover_load_meta(struct btrfs_r5l_log *log,
+                                      struct btrfs_r5l_recover_ctx *ctx)
+{
+       struct btrfs_r5l_meta_block *mb;
+
+       btrfs_r5l_sync_page_io(log, log->dev, ctx->pos >> 9, PAGE_SIZE,
+                              ctx->meta_page, REQ_OP_READ);
+
+       mb = kmap(ctx->meta_page);
+#ifdef BTRFS_DEBUG_R5LOG
+       trace_printk("ctx->pos %llu ctx->seq %llu pos %llu seq %llu\n",
+                    ctx->pos, ctx->seq, le64_to_cpu(mb->position),
+                    le64_to_cpu(mb->seq));
+#endif
+
+       if (le32_to_cpu(mb->magic) != BTRFS_R5LOG_MAGIC ||
+           le64_to_cpu(mb->position) != ctx->pos ||
+           le64_to_cpu(mb->seq) != ctx->seq) {
+#ifdef BTRFS_DEBUG_R5LOG
+               trace_printk("%s: mismatch magic %u expected %u\n",
+                            __func__, le32_to_cpu(mb->magic),
+                            (u32)BTRFS_R5LOG_MAGIC);
+#endif
+               kunmap(ctx->meta_page);
+               return -EINVAL;
+       }
+
+       ASSERT(le32_to_cpu(mb->meta_size) <= PAGE_SIZE);
+       kunmap(ctx->meta_page);
+
+       /* the meta block itself occupies one page */
+       ctx->total_size = PAGE_SIZE;
+
+       return 0;
+}
+
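+/*
+ * Replay the payloads described by the current meta block: read each
+ * data/parity page back from the log device and write it to its home
+ * location on the raid disks.
+ */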
+static int btrfs_r5l_recover_load_data(struct btrfs_r5l_log *log,
+                                      struct btrfs_r5l_recover_ctx *ctx)
+{
+       u64 offset;
+       struct btrfs_r5l_meta_block *mb;
+       u64 meta_size;
+       u64 io_offset;
+       struct btrfs_device *dev;
+
+       mb = kmap(ctx->meta_page);
+
+       /* the payload pages start right after the meta block page */
+       io_offset = PAGE_SIZE;
+       offset = sizeof(struct btrfs_r5l_meta_block);
+       meta_size = le32_to_cpu(mb->meta_size);
+
+       while (offset < meta_size) {
+               struct btrfs_r5l_payload *payload = (void *)mb + offset;
+
+               /* read data from the log disk and write it to payload->location */
+#ifdef BTRFS_DEBUG_R5LOG
+               trace_printk("payload type %d flags %d size %u location 0x%llx devid %llu\n",
+                            le16_to_cpu(payload->type),
+                            le16_to_cpu(payload->flags),
+                            le32_to_cpu(payload->size),
+                            le64_to_cpu(payload->location),
+                            le64_to_cpu(payload->devid));
+#endif
+
+               dev = btrfs_find_device(log->fs_info,
+                                       le64_to_cpu(payload->devid), NULL, NULL);
+               ASSERT(dev && !dev->missing);
+
+               if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_DATA) {
+                       ASSERT(le32_to_cpu(payload->size) == 1);
+                       btrfs_r5l_sync_page_io(log, log->dev,
+                                       (ctx->pos + io_offset) >> 9, PAGE_SIZE,
+                                       ctx->io_page, REQ_OP_READ);
+                       btrfs_r5l_sync_page_io(log, dev,
+                                       le64_to_cpu(payload->location) >> 9,
+                                       PAGE_SIZE, ctx->io_page, REQ_OP_WRITE);
+                       io_offset += PAGE_SIZE;
+               } else if (le16_to_cpu(payload->type) == R5LOG_PAYLOAD_PARITY) {
+                       int i;
+
+                       ASSERT(le32_to_cpu(payload->size) == 16);
+                       for (i = 0; i < le32_to_cpu(payload->size); i++) {
+                               /*
+                                * liubo: parity pages are guaranteed to be
+                                * contiguous, use just one bio to hold all
+                                * the pages and flush them.
+                                */
+                               u64 parity_off = le64_to_cpu(payload->location) +
+                                                i * PAGE_SIZE;
+
+                               btrfs_r5l_sync_page_io(log, log->dev,
+                                               (ctx->pos + io_offset) >> 9,
+                                               PAGE_SIZE, ctx->io_page,
+                                               REQ_OP_READ);
+                               btrfs_r5l_sync_page_io(log, dev,
+                                               parity_off >> 9, PAGE_SIZE,
+                                               ctx->io_page, REQ_OP_WRITE);
+                               io_offset += PAGE_SIZE;
+                       }
+               } else {
+                       ASSERT(0);
+               }
+
+               offset += sizeof(struct btrfs_r5l_payload);
+       }
+       kunmap(ctx->meta_page);
+
+       /* payload pages only; the meta page was counted in load_meta */
+       ctx->total_size += io_offset - PAGE_SIZE;
+       return 0;
+}
+
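+/*
+ * Scan the log from the last checkpoint, replaying one meta/data/parity
+ * set per iteration until the first invalid meta block is hit.
+ */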
+static int btrfs_r5l_recover_flush_log(struct btrfs_r5l_log *log,
+                                      struct btrfs_r5l_recover_ctx *ctx)
+{
+       int ret;
+
+       while (1) {
+               ret = btrfs_r5l_recover_load_meta(log, ctx);
+               if (ret)
+                       break;
+
+               ret = btrfs_r5l_recover_load_data(log, ctx);
+               ASSERT(ret >= 0);
+               if (ret)
+                       break;
+
+               ctx->seq++;
+               ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, ctx->total_size);
+       }
+
+       return ret;
+}
+
 static void btrfs_r5l_write_super(struct btrfs_fs_info *fs_info, u64 cp);
 
 static int btrfs_r5l_recover_log(struct btrfs_r5l_log *log)
 {
+       struct btrfs_r5l_recover_ctx *ctx;
+       u64 pos;
+       int ret;
+
+       ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
+       ASSERT(ctx);
+
+       ctx->pos = log->last_checkpoint;
+       ctx->seq = log->last_cp_seq;
+       ctx->meta_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       ASSERT(ctx->meta_page);
+       ctx->io_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+       ASSERT(ctx->io_page);
+
+       ret = btrfs_r5l_recover_flush_log(log, ctx);
+       if (ret) {
+               /*
+                * Expected: the scan terminates with -EINVAL at the
+                * first invalid meta block.
+                */
+       }
+
+       pos = ctx->pos;
+       log->next_checkpoint = ctx->pos;
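+       /*
+        * Leave a large gap in the sequence numbers so that anything
+        * still sitting on the log from before the crash cannot match
+        * the new sequence (the same trick md/raid5-cache uses).
+        */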
+       ctx->seq += 10000;
+       btrfs_r5l_write_empty_meta_block(log, ctx->pos, ctx->seq++);
+       ctx->pos = btrfs_r5l_ring_add(log, ctx->pos, PAGE_SIZE);
+
+       log->log_start = ctx->pos;
+       log->seq = ctx->seq;
+       /* last_checkpoint points to the new empty meta block. */
+       log->last_checkpoint = pos;
+       btrfs_r5l_write_super(log->fs_info, pos);
+
+#ifdef BTRFS_DEBUG_R5LOG
+       trace_printk("%s: log_start %llu seq %llu\n", __func__, log->log_start, 
log->seq);
+#endif
+       __free_page(ctx->meta_page);
+       __free_page(ctx->io_page);
+       kfree(ctx);
        return 0;
 }
 
-- 
2.9.4
