Hi! BEWARE: Don't use the following patch in production; it might eat your RAID set for breakfast. Attached is a patch which does 4 things: - tries to cleanly solve the RAID superblock issues on non-x86 architectures (where sizeof(mdp_super_t) was bigger than 4096, usually 4104) by introducing the RAID 0.91.x on-disk format, which is binary compatible with 0.90 on i386 (and at the same time changes it from native-endian to little-endian). - introduces a reserved-bytes setting in raidtab, for which the default is auto-probed by mkraid if not specified. If non-zero, the RAID array will make sure the first reserved_bytes bytes on the disk are never touched (resynced or whatever). This makes it possible e.g. to place a RAID partition at cylinder 0 on a disk with a Sun partition table. - fixes raid1_kmalloc in raid1.c, which allocated the wrong size (it ignored its size argument). The patch is against 2.2.14 with the 2.2.14-B1 RAID patch, because 2.3.99-pre2 is missing the raid1/5 bits. I can try to port the remaining files' changes to 2.3.99-pre2 though. - raidtab.5 man page fix I'm looking for testers both on x86 and non-x86. Cheers, Jakub ___________________________________________________________________ Jakub Jelinek | [EMAIL PROTECTED] | http://sunsite.mff.cuni.cz/~jj Linux version 2.3.99-pre2 on a sparc64 machine (1343.49 BogoMips) ___________________________________________________________________
--- linux/arch/sparc64/kernel/ioctl32.c.jj Mon Jan 24 11:36:41 2000 +++ linux/arch/sparc64/kernel/ioctl32.c Fri Mar 10 17:32:37 2000 @@ -2022,12 +2022,14 @@ asmlinkage int sys32_ioctl(unsigned int /* 0x09 */ case /* RAID_VERSION */ _IOR (MD_MAJOR, 0x10, char[12]): - case /* GET_ARRAY_INFO */ _IOR (MD_MAJOR, 0x11, char[72]): + case /* GET_ARRAY_INFO */ _IOR (MD_MAJOR, 0x11, char[128]): + case /* OLD_GET_ARRAY_INFO */ _IOR (MD_MAJOR, 0x11, char[72]): case /* GET_DISK_INFO */ _IOR (MD_MAJOR, 0x12, char[20]): case /* CLEAR_ARRAY */ _IO (MD_MAJOR, 0x20): case /* ADD_NEW_DISK */ _IOW (MD_MAJOR, 0x21, char[20]): case /* HOT_REMOVE_DISK */ _IO (MD_MAJOR, 0x22): - case /* SET_ARRAY_INFO */ _IOW (MD_MAJOR, 0x23, char[72]): + case /* SET_ARRAY_INFO */ _IOW (MD_MAJOR, 0x23, char[128]): + case /* OLD_SET_ARRAY_INFO */ _IOW (MD_MAJOR, 0x23, char[72]): case /* SET_DISK_INFO */ _IO (MD_MAJOR, 0x24): case /* WRITE_RAID_INFO */ _IO (MD_MAJOR, 0x25): case /* UNPROTECT_ARRAY */ _IO (MD_MAJOR, 0x26): --- linux/drivers/block/md.c.jj Mon Jan 24 11:36:42 2000 +++ linux/drivers/block/md.c Thu Mar 16 14:29:46 2000 @@ -11,6 +11,8 @@ - kerneld support by Boris Tobotras <[EMAIL PROTECTED]> - kmod support by: Cyrus Durgin - RAID0 bugfixes: Mark Anthony Lisher <[EMAIL PROTECTED]> + - superblock layout on non-x86 fixes and reserved_bytes support by + Jakub Jelinek <[EMAIL PROTECTED]> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -299,9 +301,18 @@ static unsigned int calc_dev_sboffset (k return size; } +static inline unsigned int calc_dev_reserved (mddev_t *mddev) +{ + unsigned int reserved = mddev->sb->reserved_bytes; + + reserved += mddev->sb->chunk_size - 1; + reserved &= ~(mddev->sb->chunk_size - 1); + return reserved / 1024; +} + static unsigned int calc_dev_size (kdev_t dev, mddev_t *mddev, int persistent) { - unsigned int size; + unsigned int size, reserved; size = calc_dev_sboffset(dev, mddev, 
persistent); if (!mddev->sb) { @@ -310,9 +321,25 @@ static unsigned int calc_dev_size (kdev_ } if (mddev->sb->chunk_size) size &= ~(mddev->sb->chunk_size/1024 - 1); + reserved = calc_dev_reserved (mddev); + if (reserved > size) + size = 0; + else + size -= reserved; return size; } +__u64 __inline__ md_read_events (mdp_super_t *sb) +{ + return (((__u64)sb->eventshi) << 32) | sb->eventslo; +} + +void __inline__ md_write_events (__u64 events, mdp_super_t *sb) +{ + sb->eventshi = events >> 32; + sb->eventslo = events; +} + /* * We check wether all devices are numbered from 0 to nb_dev-1. The * order is guaranteed even after device name changes. @@ -376,28 +403,13 @@ abort: return 1; } -static unsigned int zoned_raid_size (mddev_t *mddev) +static inline unsigned int zoned_raid_size (mddev_t *mddev) { - unsigned int mask; mdk_rdev_t * rdev; struct md_list_head *tmp; - if (!mddev->sb) { - MD_BUG(); - return -EINVAL; - } - /* - * do size and offset calculations. - */ - mask = ~(mddev->sb->chunk_size/1024 - 1); -printk("mask %08x\n", mask); - ITERATE_RDEV(mddev,rdev,tmp) { -printk(" rdev->size: %d\n", rdev->size); - rdev->size &= mask; -printk(" masked rdev->size: %d\n", rdev->size); md_size[mdidx(mddev)] += rdev->size; -printk(" new md_size: %d\n", md_size[mdidx(mddev)]); } return 0; } @@ -492,6 +504,17 @@ static void mark_rdev_faulty (mdk_rdev_t restore_flags(flags); } +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + static int read_disk_sb (mdk_rdev_t * rdev) { int ret = -EINVAL; @@ -518,13 +541,33 @@ static int read_disk_sb (mdk_rdev_t * rd bh = bread (dev, sb_offset / MD_SB_BLOCKS, MD_SB_BYTES); if (bh) { - sb = (mdp_super_t *) bh->b_data; - memcpy (rdev->sb, sb, MD_SB_BYTES); + sb = rdev->sb; + memcpy (sb, (mdp_super_t *) bh->b_data, MD_SB_BYTES); + if (sb->md_magic != MD_SB_MAGIC && + 
sb->md_magic == cpu_to_le32(MD_SB_MAGIC) && + (le32_to_cpu(sb->major_version) > 0 || + le32_to_cpu(sb->minor_version) > 90)) { + int i; + u32 *sbp = (u32 *) sb; + + for (i = 0; i < MD_SB_WORDS; i++, sbp++) + le32_to_cpus(sbp); + } + + rdev->csum_valid = calc_sb_csum(sb) == sb->sb_csum; + + if (sb->major_version == 0 && sb->minor_version <= 90 && + sizeof(mdp_old_super_t) >= sizeof(mdp_super_t)) { + /* Uh oh, 64bit events member moved half of the superblock */ + md_write_events(get_unaligned(&((mdp_old_super_t +*)sb)->events), sb); + memmove(sb->gstate_sreserved, ((mdp_old_super_t +*)sb)->gstate_sreserved, + (long)sb + MD_SB_BYTES - (long)(((mdp_old_super_t +*)sb)->gstate_sreserved)); + } } else { printk (NO_SB,partition_name(rdev->dev)); goto abort; } - printk(" [events: %08lx]\n", (unsigned long)get_unaligned(&rdev->sb->events)); + printk(" [events: %08lx]\n", (unsigned long)md_read_events(rdev->sb)); ret = 0; abort: if (bh) @@ -532,17 +575,6 @@ abort: return ret; } -static unsigned int calc_sb_csum (mdp_super_t * sb) -{ - unsigned int disk_csum, csum; - - disk_csum = sb->sb_csum; - sb->sb_csum = 0; - csum = csum_partial((void *)sb, MD_SB_BYTES, 0); - sb->sb_csum = disk_csum; - return csum; -} - /* * Check one RAID superblock for generic plausibility */ @@ -569,7 +601,7 @@ static int check_disk_sb (mdk_rdev_t * r goto abort; } - if (calc_sb_csum(sb) != sb->sb_csum) + if (!rdev->csum_valid) printk(BAD_CSUM, partition_name(rdev->dev)); ret = 0; abort: @@ -767,7 +799,7 @@ static void print_sb(mdp_super_t *sb) printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%08lx\n", sb->utime, sb->state, sb->active_disks, sb->working_disks, sb->failed_disks, sb->spare_disks, - sb->sb_csum, (unsigned long)get_unaligned(&sb->events)); + sb->sb_csum, (unsigned long)md_read_events(sb)); for (i = 0; i < MD_SB_DISKS; i++) { mdp_disk_t *desc; @@ -827,16 +859,16 @@ static int sb_equal ( mdp_super_t *sb1, int ret; mdp_super_t *tmp1, *tmp2; - tmp1 = 
kmalloc(sizeof(*tmp1),GFP_KERNEL); - tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + tmp1 = kmalloc(MD_SB_GENERIC_CONSTANT_WORDS * 4,GFP_KERNEL); + tmp2 = kmalloc(MD_SB_GENERIC_CONSTANT_WORDS * 4,GFP_KERNEL); if (!tmp1 || !tmp2) { ret = 0; goto abort; } - *tmp1 = *sb1; - *tmp2 = *sb2; + memcpy(tmp1, sb1, MD_SB_GENERIC_CONSTANT_WORDS * 4); + memcpy(tmp2, sb2, MD_SB_GENERIC_CONSTANT_WORDS * 4); /* * nr_disks is not constant @@ -935,7 +967,25 @@ static int write_disk_sb(mdk_rdev_t * rd } memset(bh->b_data,0,bh->b_size); sb = (mdp_super_t *) bh->b_data; - memcpy(sb, rdev->sb, MD_SB_BYTES); + if (rdev->sb->major_version == 0 && + rdev->sb->minor_version <= 90 && + sizeof(mdp_old_super_t) >= sizeof(mdp_super_t)) { + /* Uh oh, 64bit events member moved half of the superblock */ + memcpy(sb, rdev->sb, (long)&sb->eventslo - (long)sb); + put_unaligned(md_read_events(rdev->sb), &((mdp_old_super_t +*)sb)->events); + memcpy(((mdp_old_super_t *)sb)->gstate_sreserved, +rdev->sb->gstate_sreserved, + (long)sb + MD_SB_BYTES - (long)(((mdp_old_super_t +*)sb)->gstate_sreserved)); + } else + memcpy(sb, rdev->sb, MD_SB_BYTES); + sb->sb_csum = calc_sb_csum(sb); + if (rdev->sb->major_version > 0 || + rdev->sb->minor_version > 90) { + int i; + u32 *sbp = (u32 *) sb; + + for (i = 0; i < MD_SB_WORDS; i++, sbp++) + cpu_to_le32s(sbp); + } mark_buffer_uptodate(bh, 1); mark_buffer_dirty(bh, 1); @@ -985,9 +1035,9 @@ static int sync_sbs(mddev_t * mddev) if (rdev->faulty) continue; sb = rdev->sb; - *sb = *mddev->sb; + memcpy(sb, mddev->sb, MD_SB_BYTES); set_this_disk(mddev, rdev); - sb->sb_csum = calc_sb_csum(sb); + rdev->csum_valid = 1; } return 0; } @@ -1001,9 +1051,9 @@ int md_update_sb(mddev_t * mddev) repeat: mddev->sb->utime = CURRENT_TIME; - ev = get_unaligned(&mddev->sb->events); + ev = md_read_events(mddev->sb); ++ev; - put_unaligned(ev,&mddev->sb->events); + md_write_events(ev,mddev->sb); if (ev == (__u64)0) { /* * oops, this 64-bit counter should never wrap. 
@@ -1012,7 +1062,7 @@ repeat: */ MD_BUG(); --ev; - put_unaligned(ev,&mddev->sb->events); + md_write_events(ev,mddev->sb); } sync_sbs(mddev); @@ -1038,7 +1088,7 @@ repeat: printk("%s ", partition_name(rdev->dev)); if (!rdev->faulty) { printk("[events: %08lx]", - (unsigned long)get_unaligned(&rdev->sb->events)); + (unsigned long)md_read_events(rdev->sb)); err += write_disk_sb(rdev); } else printk(")\n"); @@ -1124,7 +1174,8 @@ static int md_import_device (kdev_t newd rdev->old_dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor); rdev->desc_nr = rdev->sb->this_disk.number; - } + } else + rdev->csum_valid = 1; md_list_add(&rdev->all, &all_raid_disks); MD_INIT_LIST_HEAD(&rdev->pending); @@ -1220,16 +1271,16 @@ static int analyze_sbs (mddev_t * mddev) * only as a last resort. (decrease it's age by * one event) */ - if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { - __u64 ev = get_unaligned(&rdev->sb->events); + if (!rdev->csum_valid) { + __u64 ev = md_read_events(rdev->sb); if (ev != (__u64)0) { --ev; - put_unaligned(ev,&rdev->sb->events); + md_write_events(ev,rdev->sb); } } printk("%s's event counter: %08lx\n", partition_name(rdev->dev), - (unsigned long)get_unaligned(&rdev->sb->events)); + (unsigned long)md_read_events(rdev->sb)); if (!freshest) { freshest = rdev; continue; @@ -1237,8 +1288,8 @@ static int analyze_sbs (mddev_t * mddev) /* * Find the newest superblock version */ - ev1 = get_unaligned(&rdev->sb->events); - ev2 = get_unaligned(&freshest->sb->events); + ev1 = md_read_events(rdev->sb); + ev2 = md_read_events(freshest->sb); if (ev1 != ev2) { out_of_date = 1; if (ev1 > ev2) @@ -1249,7 +1300,7 @@ static int analyze_sbs (mddev_t * mddev) printk(OUT_OF_DATE); printk("freshest: %s\n", partition_name(freshest->dev)); } - memcpy (sb, freshest->sb, sizeof(*sb)); + memcpy (sb, freshest->sb, MD_SB_BYTES); /* * at this point we have picked the 'best' superblock @@ -1262,8 +1313,8 @@ static int analyze_sbs (mddev_t * mddev) * Kick all non-fresh devices 
faulty */ __u64 ev1, ev2; - ev1 = get_unaligned(&rdev->sb->events); - ev2 = get_unaligned(&sb->events); + ev1 = md_read_events(rdev->sb); + ev2 = md_read_events(sb); ++ev1; if (ev1 < ev2) { printk("md: kicking non-fresh %s from array!\n", @@ -1283,8 +1334,8 @@ static int analyze_sbs (mddev_t * mddev) MD_BUG(); goto abort; } - ev1 = get_unaligned(&rdev->sb->events); - ev2 = get_unaligned(&sb->events); + ev1 = md_read_events(rdev->sb); + ev2 = md_read_events(sb); ev3 = ev2; --ev3; if ((rdev->dev != rdev->old_dev) && @@ -1451,7 +1502,7 @@ abort: static int device_size_calculation (mddev_t * mddev) { - int data_disks = 0, persistent; + int data_disks, persistent; unsigned int readahead; mdp_super_t *sb = mddev->sb; struct md_list_head *tmp; @@ -1463,6 +1514,7 @@ static int device_size_calculation (mdde * because device size has to be modulo chunk_size) */ persistent = !mddev->sb->not_persistent; + mddev->reserved = calc_dev_reserved(mddev); ITERATE_RDEV(mddev,rdev,tmp) { if (rdev->faulty) continue; @@ -1480,23 +1532,18 @@ static int device_size_calculation (mdde } } + data_disks = 1; switch (sb->level) { case -3: - data_disks = 1; - break; case -2: - data_disks = 1; - break; - case -1: - zoned_raid_size(mddev); - data_disks = 1; break; case 0: - zoned_raid_size(mddev); data_disks = sb->raid_disks; + /* Fall through */ + case -1: + zoned_raid_size(mddev); break; case 1: - data_disks = 1; break; case 4: case 5: @@ -2086,7 +2133,7 @@ static int get_version (void * arg) } #define SET_FROM_SB(x) info.x = mddev->sb->x -static int get_array_info (mddev_t * mddev, void * arg) +static int get_array_info (mddev_t * mddev, unsigned int cmd, void * arg) { mdu_array_info_t info; @@ -2103,6 +2150,7 @@ static int get_array_info (mddev_t * mdd SET_FROM_SB(raid_disks); SET_FROM_SB(md_minor); SET_FROM_SB(not_persistent); + SET_FROM_SB(reserved_bytes); SET_FROM_SB(utime); SET_FROM_SB(state); @@ -2114,7 +2162,10 @@ static int get_array_info (mddev_t * mdd SET_FROM_SB(layout); 
SET_FROM_SB(chunk_size); - if (md_copy_to_user(arg, &info, sizeof(info))) + if (cmd == OLD_GET_ARRAY_INFO) { + if (md_copy_to_user(arg, &info, sizeof(mdu_old_array_info_t))) + return -EFAULT; + } else if (md_copy_to_user(arg, &info, sizeof(info))) return -EFAULT; return 0; @@ -2403,7 +2454,7 @@ abort_export: } #define SET_SB(x) mddev->sb->x = info.x -static int set_array_info (mddev_t * mddev, void * arg) +static int set_array_info (mddev_t * mddev, unsigned int cmd, void * arg) { mdu_array_info_t info; @@ -2413,14 +2464,30 @@ static int set_array_info (mddev_t * mdd return -EBUSY; } - if (md_copy_from_user(&info, arg, sizeof(info))) + if (cmd == OLD_SET_ARRAY_INFO) { + memset(&info, 0, sizeof(info)); + if (md_copy_from_user(&info, arg, + sizeof(mdu_old_array_info_t))) + return -EFAULT; + } else if (md_copy_from_user(&info, arg, sizeof(info))) return -EFAULT; if (alloc_array_sb(mddev)) return -ENOMEM; mddev->sb->major_version = MD_MAJOR_VERSION; - mddev->sb->minor_version = MD_MINOR_VERSION; + if (MD_MAJOR_VERSION == 0 && MD_MINOR_VERSION == 91 && + sizeof(mdp_old_super_t) == sizeof(mdp_super_t) && + !info.reserved_bytes) + /* Change between 0.90 and 0.91 is relevant only + * to architectures where 0.90 superblock was longer + * than MD_SB_BYTES (unless non-zero reserved_bytes is + * used), so lets do users of ia32 a favor + * and stay up and down compatible. 
+ */ + mddev->sb->minor_version = 90; + else + mddev->sb->minor_version = MD_MINOR_VERSION; mddev->sb->patch_version = MD_PATCHLEVEL_VERSION; mddev->sb->ctime = CURRENT_TIME; @@ -2440,6 +2507,9 @@ static int set_array_info (mddev_t * mdd SET_SB(layout); SET_SB(chunk_size); + if (info.major_version > 0 || info.minor_version >= 91) + SET_SB(reserved_bytes); + mddev->sb->md_magic = MD_SB_MAGIC; /* @@ -2567,6 +2637,7 @@ static int md_ioctl (struct inode *inode switch (cmd) { case SET_ARRAY_INFO: + case OLD_SET_ARRAY_INFO: case START_ARRAY: if (mddev) { printk("array md%d already exists!\n", @@ -2580,6 +2651,7 @@ static int md_ioctl (struct inode *inode switch (cmd) { case SET_ARRAY_INFO: + case OLD_SET_ARRAY_INFO: mddev = alloc_mddev(dev); if (!mddev) { err = -ENOMEM; @@ -2593,7 +2665,7 @@ static int md_ioctl (struct inode *inode printk("ioctl, reason %d, cmd %d\n", err, cmd); goto abort; } - err = set_array_info(mddev, (void *)arg); + err = set_array_info(mddev, cmd, (void *)arg); if (err) { printk("couldnt set array info. 
%d\n", err); goto abort; @@ -2635,7 +2707,8 @@ static int md_ioctl (struct inode *inode switch (cmd) { case GET_ARRAY_INFO: - err = get_array_info(mddev, (void *)arg); + case OLD_GET_ARRAY_INFO: + err = get_array_info(mddev, cmd, (void *)arg); goto done_unlock; case GET_DISK_INFO: @@ -3895,7 +3968,7 @@ md__initfunc(void do_md_setup(char *str, chunk_size = ints[i++]; /* Chunksize */ fault = ints[i++]; /* Faultlevel */ - pers = pers | chunk_size | (fault << FAULT_SHIFT); + pers = pers | chunk_size | (fault << FAULT_SHIFT); while( str && (dev = name_to_kdev_t(str))) { do_md_add (minor, dev); @@ -4018,7 +4091,8 @@ static void md_geninit (struct gendisk * md_gendisk.part[i].nr_sects = 0; } - printk("md.c: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); + if (sizeof(mdp_super_t) != MD_SB_BYTES) + panic("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); blksize_size[MD_MAJOR] = md_blocksizes; md_set_global_readahead(md_maxreadahead); @@ -4027,4 +4101,3 @@ static void md_geninit (struct gendisk * proc_register(&proc_root, &proc_md); #endif } - --- linux/drivers/block/linear.c.jj Mon Oct 4 14:21:20 1999 +++ linux/drivers/block/linear.c Thu Mar 9 15:09:46 2000 @@ -151,7 +151,7 @@ static int linear_map (mddev_t *mddev, k block, kdevname(tmp_dev->dev), tmp_dev->size, tmp_dev->offset); *rdev = tmp_dev->dev; - *rsector = (block - tmp_dev->offset) << 1; + *rsector = (block - tmp_dev->offset + mddev->reserved) << 1; return 0; } --- linux/drivers/block/raid0.c.jj Mon Jan 24 11:36:42 2000 +++ linux/drivers/block/raid0.c Fri Mar 10 14:58:01 2000 @@ -143,8 +143,8 @@ static int raid0_run (mddev_t *mddev) printk("raid0 : nb_zone is %d.\n", nb_zone); conf->nr_zones = nb_zone; - printk("raid0 : Allocating %d bytes for hash.\n", - sizeof(struct raid0_hash)*nb_zone); + printk("raid0 : Allocating %ld bytes for hash.\n", + (long)sizeof(struct raid0_hash)*nb_zone); conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone); if (!conf->hash_table) @@ -229,7 +229,7 @@ 
static int raid0_map (mddev_t *mddev, kd struct strip_zone *zone; mdk_rdev_t *tmp_dev; int blk_in_chunk, chunksize_bits, chunk, chunk_size; - long block, rblock; + unsigned long block, rblock; chunk_size = mddev->param.chunk_size >> 10; chunksize_bits = ffz(~chunk_size); @@ -237,7 +237,7 @@ static int raid0_map (mddev_t *mddev, kd hash = conf->hash_table + block / conf->smallest->size; if (hash - conf->hash_table > conf->nr_zones) { - printk(KERN_DEBUG "raid0_map: invalid block %ul\n", block); + printk(KERN_DEBUG "raid0_map: invalid block %lu\n", block); return -1; } @@ -261,7 +261,7 @@ static int raid0_map (mddev_t *mddev, kd blk_in_chunk = block & (chunk_size -1); chunk = (block - zone->zone_offset) / (zone->nb_dev << chunksize_bits); tmp_dev = zone->dev[(block >> chunksize_bits) % zone->nb_dev]; - rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset; + rblock = (chunk << chunksize_bits) + blk_in_chunk + zone->dev_offset + +mddev->reserved; *rdev = tmp_dev->dev; *rsector = rblock << 1; --- linux/drivers/block/raid5.c.jj Mon Oct 4 14:22:12 1999 +++ linux/drivers/block/raid5.c Thu Mar 9 17:24:46 2000 @@ -586,7 +586,7 @@ static void raid5_build_block (struct st mddev_t *mddev = conf->mddev; char *b_data; kdev_t dev = mddev_to_kdev(mddev); - int block = sh->sector / (sh->size >> 9); + int block = sh->sector / (sh->size >> 9) + (mddev->reserved << 1); b_data = ((volatile struct buffer_head *) bh)->b_data; memset (bh, 0, sizeof (struct buffer_head)); @@ -1462,7 +1462,7 @@ static int __check_consistency (mddev_t static int check_consistency (mddev_t *mddev) { - if (__check_consistency(mddev, 0)) + if (__check_consistency(mddev, mddev->reserved)) /* * We are not checking this currently, as it's legitimate to have * an inconsistent array, at creation time. 
--- linux/drivers/block/raid1.c.jj Mon Oct 4 14:22:09 1999 +++ linux/drivers/block/raid1.c Thu Mar 16 18:15:08 2000 @@ -40,7 +40,7 @@ static void * raid1_kmalloc (int size) * simply can not afford to fail an allocation because * there is no failure return path (eg. make_request()) */ - while (!(ptr = kmalloc (sizeof (raid1_conf_t), GFP_KERNEL))) + while (!(ptr = kmalloc (size, GFP_KERNEL))) printk ("raid1: out of memory, retrying...\n"); memset(ptr, 0, size); @@ -266,6 +266,7 @@ static int raid1_make_request (mddev_t * memcpy(bh_req, bh, sizeof(*bh)); bh_req->b_end_io = raid1_end_request; bh_req->b_dev_id = r1_bh; + bh_req->b_rsector += (mddev->reserved << 1); map_and_make_request (rw, bh_req); return 0; } @@ -311,7 +312,7 @@ static int raid1_make_request (mddev_t * mirror_bh[i]->b_blocknr = bh->b_blocknr; mirror_bh[i]->b_dev = bh->b_dev; mirror_bh[i]->b_rdev = conf->mirrors[i].dev; - mirror_bh[i]->b_rsector = bh->b_rsector; + mirror_bh[i]->b_rsector = bh->b_rsector + (mddev->reserved << 1); mirror_bh[i]->b_state = (1<<BH_Req) | (1<<BH_Dirty); if (lowprio) mirror_bh[i]->b_state |= (1<<BH_LowPrio); @@ -866,7 +867,7 @@ static int __check_consistency (mddev_t static int check_consistency (mddev_t *mddev) { - if (__check_consistency(mddev, 0)) + if (__check_consistency(mddev, mddev->reserved)) /* * we do not do this currently, as it's perfectly possible to * have an inconsistent array when it's freshly created. Only --- linux/include/linux/raid/md_p.h.jj Mon Oct 4 15:41:29 1999 +++ linux/include/linux/raid/md_p.h Fri Mar 10 09:55:05 2000 @@ -115,8 +115,9 @@ typedef struct mdp_superblock_s { __u32 not_persistent; /* 12 does it have a persistent superblock */ __u32 set_uuid1; /* 13 Raid set identifier #2 */ __u32 set_uuid2; /* 14 Raid set identifier #3 */ - __u32 set_uuid3; /* 14 Raid set identifier #4 */ - __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 reserved_bytes; /* 16 # of reserv. 
bytes at start of disks */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 17]; /* * Generic state information @@ -128,7 +129,8 @@ typedef struct mdp_superblock_s { __u32 failed_disks; /* 4 Number of failed disks */ __u32 spare_disks; /* 5 Number of spare disks */ __u32 sb_csum; /* 6 checksum of the whole superblock */ - __u64 events; /* 7 number of superblock updates (64-bit!) */ + __u32 eventslo; /* 7 number of superblock updates (low bits) */ + __u32 eventshi; /* 8 number of superblock updates (high bits)*/ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; /* @@ -156,6 +158,47 @@ typedef struct mdp_superblock_s { mdp_disk_t this_disk; } mdp_super_t; + +typedef struct mdp_old_superblock_s { + /* + * Constant generic information + */ + __u32 gstate_c[MD_SB_GENERIC_CONSTANT_WORDS]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ + __u64 events; /* 7(8) number of superblock updates (64bit) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + __u32 pstate_c[MD_SB_PERSONALITY_WORDS]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_old_super_t; #endif _MD_P_H --- linux/include/linux/raid/md.h.jj Wed Feb 16 08:06:11 2000 +++ linux/include/linux/raid/md.h Fri Mar 10 14:30:47 2000 @@ -57,7 +57,7 @@ * Different patchlevel versions are downward and upward compatible. 
*/ #define MD_MAJOR_VERSION 0 -#define MD_MINOR_VERSION 90 +#define MD_MINOR_VERSION 91 #define MD_PATCHLEVEL_VERSION 0 extern int md_size[MAX_MD_DEVS]; --- linux/include/linux/raid/md_k.h.jj Mon Oct 4 15:41:29 1999 +++ linux/include/linux/raid/md_k.h Thu Mar 9 14:15:22 2000 @@ -169,6 +169,7 @@ struct mdk_rdev_s mdp_super_t *sb; int sb_offset; + int csum_valid; int faulty; /* if faulty do not issue IO requests */ int desc_nr; /* descriptor index in the superblock */ @@ -197,6 +198,7 @@ struct mddev_s int sb_dirty; mdu_param_t param; int ro; + int reserved; unsigned int curr_resync; unsigned long resync_start; char *name; --- linux/include/linux/raid/md_u.h.jj Mon Oct 4 15:41:29 1999 +++ linux/include/linux/raid/md_u.h Thu Mar 9 14:50:12 2000 @@ -20,6 +20,7 @@ /* status */ #define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) #define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define OLD_GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_old_array_info_t) #define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) #define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) @@ -28,6 +29,7 @@ #define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) #define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) #define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define OLD_SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_old_array_info_t) #define SET_DISK_INFO _IO (MD_MAJOR, 0x24) #define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) #define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) @@ -77,9 +79,53 @@ typedef struct mdu_array_info_s { * Personality information */ int layout; /* 0 the array's physical layout */ - int chunk_size; /* 1 chunk size in bytes */ + int chunk_size; /* 1 chunk size in bytes */ + + int reserved_bytes; /* Number of reserved bytes at the beginning */ + + /* + * The meaning of these fields can be specified later on + * and will be dependent on major/minor version specified + * in this structure. 
+ * This is so that the ioctl number does not have to change with + * every field addition. + */ + int reserved[32 - 19]; } mdu_array_info_t; + +typedef struct mdu_old_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) */ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_old_array_info_t; typedef struct mdu_disk_info_s { /* --- linux/include/linux/raid/raid1.h.jj Fri Mar 10 14:32:31 2000 +++ linux/include/linux/raid/raid1.h Thu Mar 16 14:29:13 2000 @@ -29,8 +29,8 @@ struct raid1_private_data { int last_used; unsigned long next_sect; int sect_count; - mdk_thread_t *thread, *resync_thread; int resync_mirrors; + mdk_thread_t *thread, *resync_thread; struct mirror_info *spare; };
--- raidtools-0.90/md-int.h.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/md-int.h Fri Mar 10 14:10:31 2000 @@ -20,6 +20,7 @@ /* don't include the kernel RAID header! */ #define _MD_H +typedef unsigned long long md_u64; typedef unsigned int md_u32; typedef unsigned short md_u16; typedef unsigned char md_u8; @@ -52,6 +53,7 @@ struct md_version { /* status */ #define RAID_VERSION _IOR (MD_MAJOR, 0x10, struct md_version) #define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, md_array_info_t) +#define OLD_GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, md_old_array_info_t) #define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, md_disk_info_t) #define PRINT_RAID_DEBUG _IO (MD_MAJOR, 0x13) @@ -60,6 +62,8 @@ struct md_version { #define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, md_disk_info_t) #define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) #define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, md_array_info_t) +#define OLD_SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, md_old_array_info_t) + #define SET_DISK_INFO _IO (MD_MAJOR, 0x24) #define WRITE_RAID_INFO _IO (MD_MAJOR, 0x25) #define UNPROTECT_ARRAY _IO (MD_MAJOR, 0x26) @@ -176,14 +180,19 @@ typedef struct md_superblock_s { md_u32 minor_version; /* 2 minor version ... */ md_u32 patch_version; /* 3 patchlevel version ... 
*/ md_u32 gvalid_words; /* 4 Number of used words in this section */ - md_u32 set_magic; /* 5 Raid set identifier */ + md_u32 set_uuid0; /* 5 Raid set identifier */ md_u32 ctime; /* 6 Creation time */ md_u32 level; /* 7 Raid personality */ md_u32 size; /* 8 Apparent size of each individual disk */ md_u32 nr_disks; /* 9 total disks in the raid set */ md_u32 raid_disks; /* 10 disks in a fully functional raid set */ md_u32 md_minor; /* 11 preferred MD minor device number */ - md_u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 12]; + md_u32 not_persistent; /* 12 does it have a persistent superblock */ + md_u32 set_uuid1; /* 13 Raid set identifier #2 */ + md_u32 set_uuid2; /* 14 Raid set identifier #3 */ + md_u32 set_uuid3; /* 15 Raid set identifier #4 */ + md_u32 reserved_bytes; /* 16 # of reserv. bytes at start of disks */ + md_u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 17]; /* * Generic state information @@ -194,14 +203,19 @@ typedef struct md_superblock_s { md_u32 working_disks; /* 3 Number of working disks */ md_u32 failed_disks; /* 4 Number of failed disks */ md_u32 spare_disks; /* 5 Number of spare disks */ - md_u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 6]; + md_u32 sb_csum; /* 6 checksum of the whole superblock */ + md_u32 eventslo; /* 7 number of superblock updates (low bits) */ + md_u32 eventshi; /* 8 number of superblock updates (high bits)*/ + md_u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; /* * Personality information */ md_u32 layout; /* 0 the array's physical layout */ md_u32 chunk_size; /* 1 chunk size in bytes */ - md_u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 2]; + md_u32 root_pv; /* 2 LV root PV */ + md_u32 root_block; /* 3 LV root block */ + md_u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; /* * Disks information @@ -220,6 +234,49 @@ typedef struct md_superblock_s { } md_superblock_t; +typedef struct md_old_superblock_s { + /* + * Constant generic information + */ + md_u32 gstate_c[MD_SB_GENERIC_CONSTANT_WORDS]; + + 
/* + * Generic state information + */ + md_u32 utime; /* 0 Superblock update time */ + md_u32 state; /* 1 State bits (clean, ...) */ + md_u32 active_disks; /* 2 Number of currently active disks */ + md_u32 working_disks; /* 3 Number of working disks */ + md_u32 failed_disks; /* 4 Number of failed disks */ + md_u32 spare_disks; /* 5 Number of spare disks */ + md_u32 sb_csum; /* 6 checksum of the whole superblock */ + md_u64 events; /* 7(8) number of superblock updates (64bit) */ + md_u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + md_u32 pstate_c[MD_SB_PERSONALITY_WORDS]; + + /* + * Disks information + */ + md_descriptor_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + md_u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + md_descriptor_t this_disk; + +} md_old_superblock_t; + + + /* * options passed in raidstart: */ @@ -264,7 +321,51 @@ typedef struct md_array_info_s { md_u32 layout; /* 0 the array's physical layout */ md_u32 chunk_size; /* 1 chunk size in bytes */ + md_u32 reserved_bytes; /* Number of reserved bytes at the beginning */ + + /* + * The meaning of these fields can be specified later on + * and will be dependent on major/minor version specified + * in this structure. + * This is so that the ioctl number does not have to change with + * every field addition. + */ + md_u32 reserved[32 - 19]; + } md_array_info_t; + +typedef struct md_old_array_info_s { + /* + * Generic constant information + */ + md_u32 major_version; + md_u32 minor_version; + md_u32 patch_version; + md_u32 ctime; + md_u32 level; + md_u32 size; + md_u32 nr_disks; + md_u32 raid_disks; + md_u32 md_minor; + md_u32 not_persistent; + + /* + * Generic state information + */ + md_u32 utime; /* 0 Superblock update time */ + md_u32 state; /* 1 State bits (clean, ...) 
*/ + md_u32 active_disks; /* 2 Number of currently active disks */ + md_u32 working_disks; /* 3 Number of working disks */ + md_u32 failed_disks; /* 4 Number of failed disks */ + md_u32 spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + md_u32 layout; /* 0 the array's physical layout */ + md_u32 chunk_size; /* 1 chunk size in bytes */ + +} md_old_array_info_t; typedef struct md_disk_info_s { /* --- raidtools-0.90/common.h.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/common.h Thu Feb 24 12:16:29 2000 @@ -39,7 +39,7 @@ typedef int kdev_t; #define RAID_CONFIG "/etc/raidtab" #define MKRAID_MAJOR_VERSION (0) -#define MKRAID_MINOR_VERSION (90) +#define MKRAID_MINOR_VERSION (91) #define MKRAID_PATCHLEVEL_VERSION (0) extern int do_quiet_flag; --- raidtools-0.90/raid_io.c.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/raid_io.c Fri Mar 10 17:56:31 2000 @@ -16,6 +16,7 @@ #include <linux/fs.h> /* for BLKGETSIZE */ #endif #include <sys/sysmacros.h> +#include <endian.h> #ifndef BLOCK_SIZE #define BLOCK_SIZE 1024 @@ -34,6 +35,35 @@ md_cfg_entry_t *p; md_superblock_t *sb; +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define raid_le16(x) (md_u16)(x) +#define raid_le32(x) (md_u32)(x) +#define raid_be16(x) \ + (md_u16)( \ + (((md_u16)(x) & (md_u16)0xff00) >> 8) | \ + (((md_u16)(x) & (md_u16)0x00ff) << 8)) +#define raid_be32(x) \ + ((md_u32)( \ + (((md_u32)(x) & (md_u32)0x000000ffUL) << 24) | \ + (((md_u32)(x) & (md_u32)0x0000ff00UL) << 8) | \ + (((md_u32)(x) & (md_u32)0x00ff0000UL) >> 8) | \ + (((md_u32)(x) & (md_u32)0xff000000UL) >> 24) )) +#elif __BYTE_ORDER == __BIG_ENDIAN +#define raid_le16(x) \ + (md_u16)( \ + (((md_u16)(x) & (md_u16)0xff00) >> 8) | \ + (((md_u16)(x) & (md_u16)0x00ff) << 8)) +#define raid_le32(x) \ + ((md_u32)( \ + (((md_u32)(x) & (md_u32)0x000000ffUL) << 24) | \ + (((md_u32)(x) & (md_u32)0x0000ff00UL) << 8) | \ + (((md_u32)(x) & (md_u32)0x00ff0000UL) >> 8) | \ + (((md_u32)(x) & (md_u32)0xff000000UL) >> 24) )) +#define raid_be16(x) 
(md_u16)(x) +#define raid_be32(x) (md_u32)(x) +#else +#error Unknown byte order +#endif #define TIME long long @@ -81,7 +111,7 @@ void progress (unsigned long blocks, uns } #undef F -#if !(defined(__alpha__) || defined(__sparc_v9__)) +#if !(defined(__alpha__) || (defined(__sparc__) && defined(__arch64__))) # ifndef __NR__llseek # ifdef __sparc__ # define __NR__llseek 236 @@ -105,7 +135,7 @@ long long raidseek (unsigned int fd, uns long long result; int retval; -#if defined(__alpha__) || defined(__sparc_v9__) +#if defined(__alpha__) || (defined(__sparc__) && defined(__arch64__)) return lseek(fd, offset, SEEK_SET); #else retval = _llseek (fd, ((unsigned long long) offset) >> 32, @@ -115,11 +145,39 @@ long long raidseek (unsigned int fd, uns #endif } -int upgrade_sb (int fd, md_superblock_t *sb, md_cfg_entry_t * cfg, int verbose) +/* Check if this RAID device might contain some embedded partition table. + * In that case we try to reserve bytes at the beginning of the RAID array, + * so that it does not get smashed. 
+ */ +int check_partition_table (int fd, char **name) +{ + md_u16 sun_disk_label[256]; + + if (raidseek(fd, 0) == -1) + return -1; + if (read(fd, sun_disk_label, sizeof(sun_disk_label)) + != sizeof(sun_disk_label)) + return -1; + if (raid_be16(sun_disk_label[254]) == 0xDABE) { + md_u16 csum, *p; + + for (csum = 0, p = sun_disk_label; + p < sun_disk_label + 256; p++) + csum ^= *p; + if (!csum) { + *name = "Sun disk label"; + return 1024; + } + } + return 0; +} + +static int upgrade_sb (int fd, md_superblock_t *sb, md_cfg_entry_t * cfg, int verbose) { struct stat stat_buf; md_descriptor_t *disk; int i; + int mkraid_minor_version = MKRAID_MINOR_VERSION; if ( (sb->major_version == MKRAID_MAJOR_VERSION) && @@ -130,6 +188,22 @@ int upgrade_sb (int fd, md_superblock_t } if ( + (MKRAID_MAJOR_VERSION == 0) && + (MKRAID_MINOR_VERSION == 91) && + (sb->major_version == MKRAID_MAJOR_VERSION) && + (sizeof(md_superblock_t) == sizeof(md_old_superblock_t))) { + if (sb->minor_version == 90) { + /* No need to upgrade from 0.90 to 0.91 on ia32 and + * other archs where nothing moves between those + * two versions. 
*/ + fprintf(stderr, "array needs no upgrade\n"); + return 1; + } + if (sb->minor_version < 90) + mkraid_minor_version = 90; + } + + if ( (sb->major_version > MKRAID_MAJOR_VERSION) || ((sb->major_version == MKRAID_MAJOR_VERSION) && (sb->minor_version > MKRAID_MINOR_VERSION)) || @@ -143,18 +217,18 @@ int upgrade_sb (int fd, md_superblock_t if (verbose) { printf("MD ID: %x\n", sb->md_magic); printf("Changing MD version from %d.%d.%d to %d.%d.%d.\n", - sb->major_version, sb->minor_version,sb->patch_version, - MKRAID_MAJOR_VERSION, MKRAID_MINOR_VERSION, + sb->major_version, sb->minor_version, sb->patch_version, + MKRAID_MAJOR_VERSION, mkraid_minor_version, MKRAID_PATCHLEVEL_VERSION); } sb->major_version = MKRAID_MAJOR_VERSION; - sb->minor_version = MKRAID_MINOR_VERSION; + sb->minor_version = mkraid_minor_version; sb->patch_version = MKRAID_PATCHLEVEL_VERSION; if (verbose) if ((sb->major_version > 0) || (sb->minor_version >= 50)) - printf("preferred minor %d (md%d)\n", sb->md_minor, sb->md_minor); + printf("preferred minor %d (md%d)\n", sb->md_minor, +sb->md_minor); if (stat(cfg->md_name,&stat_buf)) { fprintf(stderr, "%s: file doesn't exist!\n", cfg->md_name); return 1; @@ -224,7 +298,7 @@ void print_sb (md_superblock_t *sb) sb->md_minor); printf("gvalid_words: %d\n", sb->gvalid_words); - printf("Raid set ID: %x\n", sb->set_magic); + printf("Raid set ID: %x %x %x %x\n", sb->set_uuid0, sb->set_uuid1, +sb->set_uuid2, sb->set_uuid3); t = (time_t) sb->ctime; printf("Creation time: %s", ctime(&t)); t = (time_t) sb->utime; @@ -245,6 +319,7 @@ void print_sb (md_superblock_t *sb) printf("Number of working disks: %d\n", sb->working_disks); printf("Number of failed disks: %d\n", sb->failed_disks); printf("Number of spare disks: %d\n", sb->spare_disks); + printf("Reserved bytes: %d\n", sb->reserved_bytes); printf("\n"); for (i = 0; i < sb->nr_disks; i++) { @@ -262,13 +337,16 @@ void print_sb (md_superblock_t *sb) } } -static int sanity_checks (char *name, int fd, int sb_offset, - 
int forceSanity, int upgradeArray, md_cfg_entry_t * cfg, int dowrite) +static int sanity_checks (struct md_version * ver, char *name, int fd, + int sb_offset, int forceSanity, int upgradeArray, + md_cfg_entry_t * cfg, int dowrite) { FILE *fp; unsigned char tmp[MAX_LINE_LENGTH]; unsigned char buffer[MD_SB_BYTES]; md_superblock_t *phys_sb; + char *part_name; + int reserve; /* * Check if the device is mounted @@ -288,14 +366,41 @@ static int sanity_checks (char *name, in fclose(fp); if (!upgradeArray) { - if (forceSanity) - return 0; if (cfg->array.param.not_persistent) /* * We have no business analyzing the contents * of a superblock-less array. */ return 0; + + reserve = check_partition_table(fd, &part_name); + if (reserve < 0) { + fprintf(stderr, "%s: couldn't read from the start of the +disk\n", name); + return 1; + } + + if (cfg->array.param.reserved_bytes & 1) { + /* reserved-bytes was not mentioned in the config file */ + if (reserve && !ver->major && ver->minor <= 90) { + if (forceSanity) + return 0; + fprintf(stderr, "%s appears to contain an embedded %s +partition table.\n" + "Use -f to override.\n", name, +part_name); + return 1; + } else if (reserve > (cfg->array.param.reserved_bytes & ~1)) { + printf("%s appears to contain an embedded %s partition +table.\n" + "Assuming %d reserved-bytes.\n", name, +part_name, reserve); + cfg->array.param.reserved_bytes = reserve | 1; + } + } else if (reserve > cfg->array.param.reserved_bytes && !forceSanity) { + fprintf(stderr, "%s appears to contain an embedded %s +partition table which needs\n" + "%d reserved bytes, while only %d +reserved-bytes was requested.\n", + name, part_name, reserve, +cfg->array.param.reserved_bytes); + return 1; + } + + if (forceSanity) + return 0; /* * Check if the device contains an ext2 filesystem */ @@ -312,7 +417,10 @@ static int sanity_checks (char *name, in if ((read(fd, buffer, MD_SB_BYTES)) != MD_SB_BYTES) return 1; phys_sb = (md_superblock_t *) buffer; - if (phys_sb->md_magic == 
MD_SB_MAGIC) { + if (phys_sb->md_magic == MD_SB_MAGIC || + (raid_le32(phys_sb->md_magic) == MD_SB_MAGIC && + (raid_le32(phys_sb->major_version) > 0 || + raid_le32(phys_sb->minor_version) > 90))) { fprintf(stderr, "%s appears to be already part of a raid array -- use -f to\nforce the destruction of the old superblock\n", name); return 1; } @@ -328,7 +436,29 @@ static int sanity_checks (char *name, in if ((read(fd, buffer, MD_SB_BYTES)) != MD_SB_BYTES) return 1; phys_sb = (md_superblock_t *) buffer; + if (phys_sb->md_magic != MD_SB_MAGIC && + raid_le32(phys_sb->md_magic) == MD_SB_MAGIC && + (raid_le32(phys_sb->major_version) > 0 || + raid_le32(phys_sb->minor_version) > 90)) { + md_u32 *p; + + for (p = (md_u32 *)phys_sb; p < (md_u32 *)(phys_sb + 1); p++) + *p = raid_le32(*p); + } if (phys_sb->md_magic == MD_SB_MAGIC) { + if (phys_sb->major_version == 0 && phys_sb->minor_version == 90 && + sizeof(md_superblock_t) != sizeof(md_old_superblock_t)) { + /* Duh, backwards compatibility. */ + md_u64 events; + + memmove(&events, &((md_old_superblock_t *)phys_sb)->events, +sizeof(md_u64)); + phys_sb->eventslo = events; + phys_sb->eventshi = events >> 32; + memmove(phys_sb->gstate_sreserved, + ((md_old_superblock_t *)phys_sb)->gstate_sreserved, + (long)phys_sb + MD_SB_BYTES - +(long)(((md_old_superblock_t *)phys_sb)->gstate_sreserved)); + } + if (dowrite) { fprintf(stderr, "upgrading superblock on %s ...\n", name); @@ -338,15 +468,24 @@ static int sanity_checks (char *name, in if (upgrade_sb(fd, phys_sb, cfg, dowrite)) return 1; if (dowrite) { + int minor_version = phys_sb->minor_version; fprintf(stderr, "new superblock:\n"); print_sb(phys_sb); if (raidseek(fd, sb_offset) == -1) return 1; + if ((phys_sb->major_version || + phys_sb->minor_version > 90) && + MD_SB_MAGIC != raid_le32(MD_SB_MAGIC)) { + md_u32 *p; + + for (p = (md_u32 *)phys_sb; p < (md_u32 *)(phys_sb + +1); p++) + *p = raid_le32(*p); + } if ((write(fd, buffer, MD_SB_BYTES)) != MD_SB_BYTES) { fprintf(stderr, "could 
not write new superblock!\n"); return 1; } - printf("sb->minor after write: %d\n", phys_sb->minor_version); + printf("sb->minor after write: %d\n", minor_version); fsync(fd); } return 0; @@ -459,11 +598,10 @@ int analyze_sb (struct md_version * ver, close(fd); return 1; } - cfg->sb_block_offset[i] = MD_NEW_SIZE_BLOCKS(nr_blocks); if (!cfg->array.param.not_persistent) { printf("disk %d: %s, %ukB, raid superblock at %dkB\n", i, cfg->device_name[i], nr_blocks, cfg->sb_block_offset[i]); - if (sanity_checks(cfg->device_name[i], fd, + if (sanity_checks(ver, cfg->device_name[i], fd, cfg->sb_block_offset[i], forceSanity, upgradeArray, cfg, 0)) { close(fd); @@ -475,6 +613,15 @@ int analyze_sb (struct md_version * ver, printf("disk %d: %s, failed\n", i, cfg->device_name[i]); } } + + if (array->param.reserved_bytes & 1) + array->param.reserved_bytes &= ~1; + if (array->param.reserved_bytes && !ver->major && ver->minor <= 90) { + fprintf(stderr, "Non-zero reserved-bytes (%d) is only supported by +kernel RAID driver %d.%d.%d\n", + array->param.reserved_bytes, ver->major, ver->minor, +ver->patchlevel); + return 1; + } + /* * second pass, write stuff out ... 
*/ @@ -502,7 +649,7 @@ int analyze_sb (struct md_version * ver, } cfg->sb_block_offset[i] = MD_NEW_SIZE_BLOCKS(nr_blocks); - if (sanity_checks(cfg->device_name[i], fd, + if (sanity_checks(ver, cfg->device_name[i], fd, cfg->sb_block_offset[i], forceSanity, upgradeArray, cfg, 1)) { close(fd); --- raidtools-0.90/mkraid.c.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/mkraid.c Fri Mar 10 14:02:21 2000 @@ -19,7 +19,7 @@ #include "popt.h" #include "version.h" -void printcfg (md_cfg_entry_t * cfg); +static void printcfg (md_cfg_entry_t * cfg); void usage (void) { printf("usage: mkraid [--configfile] [--version] [--force] [--upgrade]\n"); @@ -43,7 +43,10 @@ int i, ret, file; #endif file = open(cfg->md_name,O_RDONLY); - ret = ioctl(file, SET_ARRAY_INFO, (unsigned long)&cfg->array.param); + if (ver->major || ver->minor > 90) + ret = ioctl(file, SET_ARRAY_INFO, (unsigned long)&cfg->array.param); + else + ret = ioctl(file, OLD_SET_ARRAY_INFO, (unsigned long)&cfg->array.param); if (ret) return 1; @@ -275,7 +278,7 @@ abort: #define P(x) printf("%18s: \t %d\n",#x,cfg->array.param.x) #define DP(x) printf("%18s: \t %d\n",#x,cfg->array.disks[i].x) -void printcfg (md_cfg_entry_t * cfg) +static void printcfg (md_cfg_entry_t * cfg) { int i; @@ -298,6 +301,7 @@ void printcfg (md_cfg_entry_t * cfg) P(layout); P(chunk_size); + P(reserved_bytes); for (i = 0; i < cfg->array.param.nr_disks; i++) { printf("\n"); --- raidtools-0.90/parser.c.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/parser.c Fri Mar 10 11:36:33 2000 @@ -68,6 +68,7 @@ static int process_entry (char *par, cha } strcpy(cfg->md_name, val_s); cfg->array.param.nr_disks = 0; + cfg->array.param.reserved_bytes = 1; /* Autodetect */ last = cfg_head; while (last && last->next) last = last->next; @@ -143,6 +144,13 @@ static int process_entry (char *par, cha return 1; } array->param.chunk_size = val * MD_BLK_SIZ; + return 0; + } else if (strcmp(par, "reserved-bytes") == 0) { + if (val & 511) { + fprintf(stderr, "reserved-bytes %d must be 
a multiple of 512\n", +val); + return 1; + } + array->param.reserved_bytes = val; return 0; } else if (strcmp(par, "device") == 0) { if (array->param.nr_disks == MD_SB_DISKS) { --- raidtools-0.90/raid5.conf.sample.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/raid5.conf.sample Fri Mar 10 10:06:30 2000 @@ -3,6 +3,7 @@ raiddev /dev/md0 raid-level 5 nr-raid-disks 3 chunk-size 4 +#reserved-bytes 1024 # Parity placement algorithm --- raidtools-0.90/raid1.conf.sample.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/raid1.conf.sample Fri Mar 10 10:06:46 2000 @@ -4,6 +4,7 @@ raid-level 1 nr-raid-disks 2 nr-spare-disks 0 chunk-size 4 +#reserved-bytes 1024 device /dev/hda1 raid-disk 0 --- raidtools-0.90/raidtab.5.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/raidtab.5 Thu Mar 16 16:20:36 2000 @@ -106,11 +106,20 @@ performance on typical disks with rotati .TP \fBchunk-size \fIsize\fR -Sets the stripe size to \fIsize\fR bytes. Has to be a power of 2 and +Sets the stripe size to \fIsize\fR kilobytes. Has to be a power of 2 and has a compilation-time maximum of 4M. (MAX_CHUNK_SIZE in the kernel driver) typical values are anything from 4k to 128k, the best value should be determined by experimenting on a given array, alot depends on the SCSI and disk configuration. + +.TP +\fBreserved-bytes \fIsize\fR +Reserves at least the first \fIsize\fR bytes on each disk, so that +it can contain e.g. embedded partition tables or bootblocks. +Without this, the RAID array can happily resync them with something else. +If reserved-bytes is not specified, then mkraid checks if it finds some +known partition table or bootblock magic and sets the default accordingly. +\fIsize\fR must be a multiple of 512. 
.TP \fBdevice \fIdevpath\fR --- raidtools-0.90/raidtab.sample.jj Tue Aug 3 10:05:53 1999 +++ raidtools-0.90/raidtab.sample Thu Mar 16 16:16:16 2000 @@ -2,7 +2,8 @@ # sample raiddev configuration file # -# 'persistent' RAID5 setup, with no spare disks: +# 'persistent' RAID5 setup, with no spare disks +# and 4KB chunk size # raiddev /dev/md0 raid-level 5 @@ -51,3 +52,23 @@ raiddev /dev/md2 device /dev/sdc1 spare-disk 0 + +# +# 'persistent' RAID5 setup, with no spare disks +# with 1024 bytes at the start of each disk reserved +# for bootblocks or other things +# +raiddev /dev/md3 + raid-level 5 + nr-raid-disks 3 + nr-spare-disks 0 + persistent-superblock 1 + chunk-size 4 + reserved-bytes 1024 + + device /dev/sdb1 + raid-disk 0 + device /dev/sda1 + raid-disk 1 + device /dev/sdc1 + raid-disk 2