I described some of the problems I have with file and backup management in a kragen-tol post in March 2002, entitled "file catalogs and backups":
http://lists.canonical.org/pipermail/kragen-tol/2002-March/000691.html
http://www.mail-archive.com/[EMAIL PROTECTED]/msg00037.html
Now, spurred by recent major hardware failures in my life, I've implemented two pieces of the grandiose system I laid out there:

- a filesystem indexer that spits out all the metadata for my files
  (although it doesn't yet look inside tar and zip files for more
  files);
- a broken program to answer the simplest interesting question I
  could think of with such a catalog: which files are duplicated in
  more than one place on the filesystem?

They follow.

fs-snapshot:

#!/usr/local/bin/python
# extract enough info about your filesystem that you could, at least
# in principle, tell if it had changed, or reconstruct its old state
# if you had the files lying around somewhere; yet this info should
# remain relatively small, so that it's practical to keep it close at
# hand.

# With just pathname, Adler-32, CRC-32, MD5, and SHA-1, this data adds
# up to about 64 bytes per file gzipped, or 164 ungzipped, and it
# processes about 189 files (or 4.27 megs) per second.  This means
# that gentle's root partition's 288613 files consuming 8.2 gigabytes
# should take no more than 1920 seconds to index (or 1527 if the
# bottleneck was per-file and not per-byte in the smaller benchmark),
# and the aggregate database should be 18 megabytes compressed or 47
# uncompressed.  That's good enough to eat!

# After adding the rest of the metadata, it takes 78 bytes per file
# gzipped or 244 ungzipped; the time to index 931 files totaling 21M
# has increased to 6.17 seconds, or 150 files or 3.4 megabytes per
# second; that's 21.5 mebibytes for an index of gentle's root
# partition, and 2400 seconds or so to index it.

# In actual fact, indexing gentle's entire filesystem produced an
# index file that gzipped to 19 462 116 bytes (76 762 702 bytes
# ungzipped), in 77 minutes 14.45 seconds wallclock time, with 39
# minutes 25.42 seconds user time and 13 minutes 37.88 seconds system
# time.  That's a total of 3183 seconds of CPU time to index
# 13 278 767 kiB of data, or 4.3 MB/cpusec.  The program indexed
# 296 440 files, so it managed only 93 files per CPU second.
# Wallclock speeds were 64 files and 2.9 MB per second.  The gzipped
# index file used 65.7 bytes per file; gunzipped, it used 259 bytes
# per file.  All of this was on a 500MHz AMD K6-3.

# BUGS:
# - when it scans multiple hardlinks to the same file, it rereads the
#   file each time.
# - it doesn't save major and minor numbers of device files, which
#   makes it usable but painful for system backups.
# - nothing reads its output yet.
# - it doesn't save user and group names.
# - it doesn't yet generate THEX/Tiger tree hashes.
# - it prints some useless information --- e.g. sizes of directories
#   and symlinks.
# - it doesn't include metadata about the catalog as a whole.  It
#   would be nice to know not just that some /bin/sh was a symlink to
#   'bash', but that on 2003-03-30, the /bin/sh in
#   gentle.canonical.org's filesystem was a symlink to 'bash'.  If you
#   know that some /bin/bash once had 461400 bytes with an MD5
#   checksum of b5d4cad2a9edb1cd647b7ef86a2488, you might then know
#   whether that /bin/bash is the same one you wish you had on your
#   current filesystem.  But if you know that that /bin/bash is the
#   one on panacea.canonical.org as of yesterday, then you can take
#   some useful action, like copying it into the place you wish it
#   was!
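# For concreteness, here's roughly what one output record looks like
# for a regular file.  This example is invented, not from a real run
# (the four checksums are the well-known empty-input values, standing
# in for real ones), and the field order varies from run to run,
# because dictstr walks an unordered Python dict:
#
#   Path: '/bin/bash'
#   Permissions: 00755
#   Inum.dev: 180935.769
#   uid.gid: 0.0
#   Bytes: 461400
#   Mtime: 1048982400
#   Type: File
#   Adler32: 1
#   CRC32: 0
#   MD5: d41d8cd98f00b204e9800998ecf8427e
#   SHA-1: da39a3ee5e6b4b0d3255bfef95601890afd80709
#
# Records are separated by blank lines; only regular files get
# checksum lines, and symlinks get a 'Symlink:' line instead.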
import sys, os, md5, sha, zlib, string, stat

def dictstr(adict):
    return ''.join(["%s: %s\n" % (key, value)
                    for key, value in adict.items()])

class checksums:
    def __init__(self, fileobj):
        self.adler32 = zlib.adler32("")
        self.crc32 = zlib.crc32("")
        self.md5 = md5.new()
        self.sha1 = sha.new()
        while 1:
            instr = fileobj.read(4096)
            if instr == '': break
            self.update(instr)
        fileobj.close()
    def update(self, instr):
        self.crc32 = zlib.crc32(instr, self.crc32)
        self.adler32 = zlib.adler32(instr, self.adler32)
        self.md5.update(instr)
        self.sha1.update(instr)
    def hexstring(self, astr):
        # %02x, not %x: otherwise bytes below 0x10 lose a digit and
        # the digest comes out too short
        return string.join(["%02x" % ord(char) for char in astr], '')
    def as_string(self):
        return dictstr({
            'Adler32': '%x' % self.adler32,
            'CRC32': '%x' % self.crc32,
            'MD5': self.hexstring(self.md5.digest()),
            'SHA-1': self.sha1.hexdigest(),
        })

typemap = {
    0010000: 'FIFO',              # S_IFIFO
    0020000: 'Character device',  # S_IFCHR
    0040000: 'Dir',               # S_IFDIR
    0060000: 'Block device',      # S_IFBLK
    0100000: 'File',              # S_IFREG
    0120000: 'Symlink',           # S_IFLNK
    0140000: 'Socket',            # S_IFSOCK
}

class get_metadata:
    def __init__(self, pathname):
        self.statdata = os.lstat(pathname)
        self.mode = self.statdata[0]
    def is_regular_file(self): return stat.S_ISREG(self.mode)
    def islink(self): return stat.S_ISLNK(self.mode)
    def type(self): return typemap[stat.S_IFMT(self.mode)]
    def as_string(self):
        # lstat returns: mode ino dev nlink uid gid size atime mtime ctime
        # we ignore nlink, atime, and ctime, because they don't help with
        # backup or restore of files.
        return dictstr({
            'Permissions': '%05o' % (stat.S_IMODE(self.mode)),
            'Inum.dev': '%d.%d' % (self.statdata[1], self.statdata[2]),
            'uid.gid': '%d.%d' % (self.statdata[4], self.statdata[5]),
            'Bytes': self.statdata[6],
            'Mtime': self.statdata[8],
            'Type': self.type(),
        })

def dumpfile(output, pathname):
    output.write(dictstr({'Path': `pathname`}))
    metadata = get_metadata(pathname)
    output.write(metadata.as_string())
    if metadata.is_regular_file():
        output.write(checksums(open(pathname)).as_string())
    if metadata.islink():
        output.write(dictstr({'Symlink': `os.readlink(pathname)`}))
    # XXX save device numbers on block and char devices?
    output.write("\n")

def dumpfiles(output, dirname, filenames):
    for filename in filenames:
        dumpfile(output, "%s/%s" % (dirname, filename))

def main(argv):
    if len(argv) < 2:
        sys.stderr.write("Usage: %s dir [dir [dir ...]]\n" % argv[0])
        return 1
    for dirpath in argv[1:]:
        os.path.walk(dirpath, dumpfiles, sys.stdout)
    return 0

if __name__ == '__main__': sys.exit(main(sys.argv))

find-dups:

#!/usr/local/bin/python
# read output produced by fs-snapshot and find out which files occur
# in many places

# ook, this code horrifies.  it breaks on filenames with spaces, due
# to a quick-and-dirty implementation, and I don't have time to fix it
# before sending it out.  Sorry.

import sys, string, tempfile, os

# these fields are best when evaled:
evalable = 'path', 'bytes', 'symlink'

def read_file_rec(input):
    rv = {}
    while 1:
        line = input.readline()
        if line == '': return None
        if line == '\n': return rv
        while line[-1:] == '\n': line = line[:-1]
        name, value = line.split(':', 1)
        name = string.lower(name)
        value = string.lstrip(value)
        if name in evalable: value = eval(value)
        rv[name] = value

def addpair(adict, key, val):
    if not adict.has_key(key): adict[key] = []
    adict[key].append(val)

def main(argv):
    if len(argv) != 1:
        sys.stderr.write("Usage: %s <filesystemlisting\n" % argv[0])
        return 1
    # My first pass just used a hash of hashes.  It grew bigger than
    # my RAM and started thrashing.

    # read in data
    tmpfilename = tempfile.mktemp()
    tmpfile = open(tmpfilename, 'w')  # XXX security hole
    while 1:
        filerec = read_file_rec(sys.stdin)
        if not filerec: break
        if filerec['type'] != 'File': continue
        tmpfile.write("%(sha-1)s %(inum.dev)s %(path)s %(bytes)s\n" % filerec)
    tmpfile.close()

    # break it out by SHA-1
    os.system("sort -o %s %s" % (tmpfilename, tmpfilename))
    tmpfile = open(tmpfilename)

    # sort it by number of occurrences
    tmpfilename2 = tempfile.mktemp()
    tmpfile2 = open(tmpfilename2, 'w')  # XXX security hole
    oldsha1, inumpaths, oldbytes = None, [], None
    while 1:
        line = tmpfile.readline()
        if line:
            # bytes keeps its trailing newline, which later terminates
            # the output line
            sha1, inumdev, path, bytes = string.split(line, ' ', 4)
            if sha1 != oldsha1:
                if oldsha1 is not None:
                    tmpfile2.write("%08d %s %s %s" % (count, oldsha1,
                                   string.join(inumpaths), oldbytes))
                oldsha1, inumpaths, oldbytes, count = sha1, [], bytes, 0
        if not line:
            # flush the final group; without this, the last SHA-1
            # group is silently dropped at EOF
            if oldsha1 is not None:
                tmpfile2.write("%08d %s %s %s" % (count, oldsha1,
                               string.join(inumpaths), oldbytes))
            break
        count += 1
        inumpaths.append(inumdev)
        inumpaths.append(path)
    tmpfile2.close()
    os.system("sort -r -o %s %s" % (tmpfilename2, tmpfilename2))

    # output it
    tmpfile2 = open(tmpfilename2)
    while 1:
        line = tmpfile2.readline()
        if not line: break
        chunks = string.split(line)
        # int(), not eval(): a zero-padded count like '00000008' looks
        # like a malformed octal literal to eval
        count, inumpaths, bytes = int(chunks[0]), chunks[2:-1], chunks[-1]
        print "%d file%s of %d bytes:" % (
            count,
            (count > 1 and 's' or ''),
            int(bytes),
        )
        harddict = {}
        while inumpaths:
            inumdev, path = inumpaths[:2]
            inumpaths = inumpaths[2:]
            if not harddict.has_key(inumdev): harddict[inumdev] = []
            harddict[inumdev].append(path)
        for pathlist in harddict.values():
            print " ", ' = '.join(pathlist)
    return 0

if __name__ == '__main__': sys.exit(main(sys.argv))
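In case the plumbing isn't obvious: fs-snapshot writes its catalog to stdout, and find-dups reads one on stdin.  Assuming you've saved the two scripts under those names somewhere on your PATH (the catalog filename below is my invention), a run over a root partition might look like:

    fs-snapshot / | gzip -9 > root-catalog.gz
    zcat root-catalog.gz | find-dups | less

Gzipping the catalog is optional, but the byte counts above suggest it roughly quarters the size.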