I described some of the problems I have with file and backup
management in a kragen-tol post in March 2002, entitled "file
catalogs and backups":
http://lists.canonical.org/pipermail/kragen-tol/2002-March/000691.html
http://www.mail-archive.com/kragen-tol@canonical.org/msg00037.html

Now, spurred by recent major hardware failures in my life, I've
implemented two pieces of the grandiose system I laid out there: 

- a filesystem indexer that spits out all the metadata for my files
  (although it doesn't yet look inside of tar and zip files for more
  files);
- a broken program to answer the simplest interesting question I could
  think of with such a catalog: what files are duplicated in more than one
  place on the filesystem?

They follow.

fs-snapshot:

#!/usr/local/bin/python
# extract enough info about your filesystem that you could, at least
# in principle, tell if it had changed, or reconstruct its old state,
# if you had the files lying around somewhere; yet this info should
# remain relatively small, so that it's practical to keep it close at
# hand.

# With just pathname, adler32, CRC32, MD5, and SHA-1, this data adds
# up to about 64 bytes per file gzipped, or 164 ungzipped, and it
# processes about 189 files (or 4.27 megs) per second.  This means
# that gentle's root partition's 288613 files consuming 8.2 gigabytes
# should take no more than 1920 seconds to index (or 1527 if the
# bottleneck was per-file and not per-byte in the smaller benchmark),
# and the aggregate database should be 18 megabytes compressed or 47
# uncompressed.  That's good enough to eat!
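
# (Where those numbers come from: 288613 files / 189 files/sec ~= 1527
# sec; 8.2e9 bytes / 4.27e6 bytes/sec ~= 1920 sec; 288613 files * 64
# bytes/file ~= 18 MB of gzipped index, or * 164 bytes/file ~= 47 MB
# ungzipped.)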

# After adding the rest of the metadata, it takes 78 bytes per file
# gzipped or 244 ungzipped; time to index 931 files totaling 21M has
# increased to 6.17 seconds, or 150 files or 3.4 megabytes per second;
# that's 21.5 mebibytes for an index of gentle's root partition, and
# 2400 seconds or so to index it.

# In actual fact, indexing gentle's entire filesystem produced an
# index file that gzipped to 19 462 116 bytes (76 762 702 bytes
# ungzipped), in 77 minutes 14.45 seconds wallclock time, with 39
# minutes 25.42 seconds user time and 13 minutes 37.88 seconds system
# time.  That's a total of 3183 seconds of CPU time to index 13 278
# 767 kiB of data, or 4.3 MB/cpusec.  The file indexed 296 440 files,
# so it was only able to index 93 files per CPU second.  Wallclock
# speeds were 64 files and 2.9 MB per second.  The gzipped index file
# used 65.7 bytes per file; gunzipped, it used 259 bytes per file.

# All of this was on a 500MHz AMD K6-3.

# BUGS:
# - when it scans multiple hardlinks to the same file, it rereads the file
#   each time.
# - it doesn't save major and minor numbers of device files, which makes it
#   usable but painful for system backups.
# - nothing reads its output yet
# - it doesn't save user and group names
# - it doesn't yet generate THEX/Tiger tree hashes
# - it prints some useless information --- e.g. sizes of directories
#   and symlinks
# - it doesn't include metadata about the entire file.  It would be nice to
#   know not just that some /bin/sh was a symlink to 'bash', but that at
#   2003-03-30, the /bin/sh in gentle.canonical.org's filesystem was a symlink
#   to 'bash'.  If you know that some '/bin/bash' once had 461400
#   bytes that have an MD5 checksum of b5d4cad2a9edb1cd647b7ef86a2488, you
#   might then know whether that /bin/bash is the same one you wish you had
#   on your current filesystem.  But if you know that that /bin/bash is the
#   one on panacea.canonical.org as of yesterday, then you can take some
#   useful action, like copying it into the place you wish it was!
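
# (A sketch of one way to attack that last point, not implemented here:
# before walking, main() could write a header record identifying the
# snapshot, something like
#
#     import socket, time
#     sys.stdout.write(dictstr({'Host': socket.gethostname(),
#                               'Date': time.strftime('%Y-%m-%d %H:%M:%S',
#                                                     time.gmtime())}))
#     sys.stdout.write("\n")
#
# socket.gethostname() and time.strftime() are standard library; the 'Host'
# and 'Date' field names are made up here, and anything that reads the
# catalog would have to learn to expect such a record.)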

import sys, os, md5, sha, zlib, string, stat

def dictstr(adict):
    return ''.join(["%s: %s\n" % (key, value) for key, value in adict.items()])

class checksums:
    def __init__(self, fileobj):
        self.adler32 = zlib.adler32("")
        self.crc32 = zlib.crc32("")
        self.md5 = md5.new()
        self.sha1 = sha.new()

        while 1:
            instr = fileobj.read(4096)
            if instr == '': break
            self.update(instr)
        fileobj.close()
    def update(self, instr):
        self.crc32 = zlib.crc32(instr, self.crc32)
        self.adler32 = zlib.adler32(instr, self.adler32)
        self.md5.update(instr)
        self.sha1.update(instr)
    def hexstring(self, astr):
        return string.join(["%02x" % ord(char) for char in astr], '')
    def as_string(self):
        return dictstr({
            'Adler32': '%x' % self.adler32,
            'CRC32': '%x' % self.crc32,
            'MD5': self.hexstring(self.md5.digest()),
            'SHA-1': self.sha1.hexdigest(),
            })

typemap = {
    0010000: 'FIFO',              # S_IFIFO
    0020000: 'Character device',  # S_IFCHR
    0040000: 'Dir',               # S_IFDIR
    0060000: 'Block device',      # S_IFBLK
    0100000: 'File',              # S_IFREG
    0120000: 'Symlink',           # S_IFLNK
    0140000: 'Socket',            # S_IFSOCK
    }

class get_metadata:
    def __init__(self, pathname):
        self.statdata = os.lstat(pathname)
        self.mode = self.statdata[0]
    def is_regular_file(self):
        return stat.S_ISREG(self.mode)
    def islink(self):
        return stat.S_ISLNK(self.mode)
    def type(self):
        return typemap[stat.S_IFMT(self.mode)]
    def as_string(self):
        # lstat returns: mode ino dev nlink uid gid size atime mtime ctime
        # we ignore nlink, atime, and ctime, because they don't help with
        # backup or restore of files.
        return dictstr({
            'Permissions': '%05o' % (stat.S_IMODE(self.mode)),
            'Inum.dev': '%d.%d' % (self.statdata[1], self.statdata[2]),
            'uid.gid': '%d.%d' % (self.statdata[4], self.statdata[5]),
            'Bytes': self.statdata[6],
            'Mtime': self.statdata[8],
            'Type': self.type(),
            })

def dumpfile(output, pathname):
    output.write(dictstr({'Path': `pathname`}))
    metadata = get_metadata(pathname)
    output.write(metadata.as_string())
    if metadata.is_regular_file():
        output.write(checksums(open(pathname)).as_string())
    if metadata.islink():
        output.write(dictstr({'Symlink': `os.readlink(pathname)`}))
    # XXX save device numbers on block and char devices?
    output.write("\n")

def dumpfiles(output, dirname, filenames):
    for filename in filenames:
        dumpfile(output, "%s/%s" % (dirname, filename))

def main(argv):
    if len(argv) < 2:
        sys.stderr.write("Usage: %s dir [dir [dir ...]]\n" % argv[0])
        return 1
    for dirpath in argv[1:]:
        os.path.walk(dirpath, dumpfiles, sys.stdout)
    return 0

if __name__ == '__main__': sys.exit(main(sys.argv))
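

For reference, each record fs-snapshot writes is a block of "Name: value"
lines ended by a blank line.  The Path line always comes first; the stat
fields come next and the checksums last, each group in whatever order its
dictionary happens to iterate.  A record for a regular file would look
something like this (every value below is invented for illustration, not
real output):

    Path: '/bin/true'
    Type: File
    Permissions: 00755
    Inum.dev: 32769.770
    uid.gid: 0.0
    Bytes: 9768
    Mtime: 1041379200
    Adler32: 8f63033f
    CRC32: 2d1b8a7e
    MD5: 9f86d081884c7d659a2feaa0c55ad015
    SHA-1: a3b9c5e6d2f104987a6b5c4d3e2f1a0b9c8d7e6f

find-dups below only looks at the Type, SHA-1, Inum.dev, Path, and Bytes
fields.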



find-dups:

#!/usr/local/bin/python
# read output produced by fs-snapshot and find out which files occur in many
# places

# ook, this code horrifies.  it breaks on filenames with spaces, due to a
# quick-and-dirty implementation, and I don't have time to fix it before
# sending it out.  Sorry.
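
# (A possible fix, only a sketch: repr() each path when writing the temporary
# files, the way fs-snapshot already quotes Path in its output, and separate
# the fields with tabs instead of spaces; repr() of a string never contains a
# raw tab, so splitting on tab recovers the fields and eval() recovers the
# path.  None of that is done below.)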

import sys, operator, string, tempfile, os

# these fields are best read back with eval():
evalable = 'path', 'bytes', 'symlink'

def read_file_rec(input):
    rv = {}
    while 1:
        line = input.readline()
        if line == '': return None
        if line == '\n': return rv
        while line[-1:] == '\n': line = line[:-1]
        name, value = line.split(':', 1)
        name = string.lower(name)
        value = string.lstrip(value)
        if name in evalable: value = eval(value)
        rv[name] = value

def addpair(adict, key, val):
    if not adict.has_key(key): adict[key] = []
    adict[key].append(val)

def main(argv):
    if len(argv) != 1:
        sys.stderr.write("Usage: %s <filesystemlisting" % argv[0])
        return 1

    # My first pass just used a hash of hashes.  It grew bigger than
    # my RAM and started thrashing.

    # read in data
    tmpfilename = tempfile.mktemp()
    tmpfile = open(tmpfilename, 'w')  # XXX security hole
    while 1:
        filerec = read_file_rec(sys.stdin)
        if not filerec: break
        if filerec['type'] != 'File': continue
        tmpfile.write("%(sha-1)s %(inum.dev)s %(path)s %(bytes)s\n" % filerec)
    tmpfile.close()

    # break it out by SHA-1
    os.system("sort -o %s %s" % (tmpfilename, tmpfilename))
    tmpfile = open(tmpfilename)

    # sort it by number of occurrences
    tmpfilename2 = tempfile.mktemp()
    tmpfile2 = open(tmpfilename2, 'w') # XXX security hole
    oldsha1, inumpaths, oldbytes = None, [], None
    while 1:
        line = tmpfile.readline()
        if line:
            # NB: 'bytes' keeps its trailing newline here, which is what
            # terminates the lines written to tmpfile2 below; this split also
            # breaks if the path contains spaces, as noted above.
            sha1, inumdev, path, bytes = string.split(line, ' ', 4)
        else:
            sha1 = None   # at EOF, force the last SHA-1 group to be flushed
        if sha1 != oldsha1:
            if oldsha1 is not None:
                tmpfile2.write("%08d %s %s %s" % (count, oldsha1,
                                                    string.join(inumpaths),
                                                    oldbytes))
            oldsha1, inumpaths, oldbytes, count = sha1, [], bytes, 0
        if not line: break
        count += 1
        inumpaths.append(inumdev)
        inumpaths.append(path)
    tmpfile2.close()
    os.system("sort -r -o %s %s" % (tmpfilename2, tmpfilename2))

    # output it
    tmpfile2 = open(tmpfilename2)
    while 1:
        line = tmpfile2.readline()
        if not line: break
        chunks = string.split(line)
        # the count field is zero-padded, so parse it with int(), not eval()
        # (eval would read "00000010" as octal and choke on an 8 or a 9)
        count, inumpaths, bytes = int(chunks[0]), chunks[2:-1], chunks[-1]
        print "%d file%s of %d bytes:" % (
            count, (count > 1 and 's' or ''), int(bytes),
            )
        harddict = {}
        while inumpaths:
            inumdev, path = inumpaths[:2]
            inumpaths = inumpaths[2:]
            addpair(harddict, inumdev, path)
        for pathlist in harddict.values():
            print "   ", ' = '.join(pathlist)

    return 0

if __name__ == '__main__': sys.exit(main(sys.argv))
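
The two chain together with ordinary shell plumbing, something like this
(the filenames here are only examples):

    ./fs-snapshot / | gzip > gentle-root-catalog.gz
    zcat gentle-root-catalog.gz | ./find-dups > dups.txt

fs-snapshot writes its catalog to stdout, and find-dups reads one from stdin.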
