On Thu, Feb 08, 2001 at 12:14:33PM +0000, Dave Hodgkinson wrote:
> 
> Anyone got anything to hand that will spot massive duplications in a
> filesystem? I've got a whole bunch of servers mirrored to a backup
> server and it's be nice to identify where entire file trees have been
> replicated...

Curtesy of tomc ... if that is not an oxymoron :-)

I think it dates back to Perl3 ...
-- 
Chris Benson
#!/usr/bin/perl
# dupf -- find duplicate files.  
# [EMAIL PROTECTED]
# 28 july 1993
# nb: run with recent version of perl; i'm getting semi-colon lazy

@ARGV = ('.') unless @ARGV;
require 'find.pl';
&find(@ARGV);
sub wanted { $dupnames{$_} .= "\0$name" if -f; } 

foreach $name (sort by_basename keys %dupnames) {
    @files = split(/\0/, $dupnames{$name});
    shift @files;  # leading null field
    next if @files == 1;
    %size = %devino = %chksum = ();

    # compare sizes; if any found, we'll have to 
    foreach $file (@files) {
        $size = -s $file;
        $size{$file} = $size;
        ($dev,$ino) = stat(_);          # save cached info 
        $devino{$file} = "$dev,$ino";
    } 
    @slist = reverse sort by_sameness @files;
    if ($- < @slist) { $- = 0; } # force new page if won't fit
    $prev = undef;
    foreach $file (@slist) { 
        $flag = ' ';
        if ($prev) {
            $flag = do {
                if    ($devino{$prev} eq $devino{$file}) { 'L' }
                elsif (&chksum($prev) == &chksum($file)) { 'C' }
                elsif ($size{$prev}   == $size{$file})   { 'S' } 
                else                                     { ' ' }
            }
        } 
        write;
        $prev = $file;
    }
    &blank;
} 

format STDOUT_TOP = 
DevIno          Size   Chksum  F  Filename
----------  --------   ------  -  --------
.

format STDOUT = 
@<<<<<<<<<  @>>>>>>>   @>>>>>  @  @*
$devino{$file}, $size{$file}, &chksum($file), $flag, $file
.

format BLANK = 

.

sub by_sameness {
    &chksum($a) <=> &chksum($b) 
        ||
    $devino{$a} cmp $devino{$b} 
        ||
    $size{$a}   <=> $size{$b}  
        || 
    $a          cmp $b;

} 
sub by_basename {
    substr($a, rindex($a, '/')+1) cmp substr($b, rindex($b, '/')+1);
} 
sub chksum {
    local($pathname) = @_;
    local($/) = undef; local($_);

    if (defined $chksum{$pathname}) { return $chksum{$pathname} } 

    if (!open (FILE, $pathname)) {
        warn "can't open $pathname for chksum: $!\n";
        return undef;
    } 
    $_ = <FILE>;
    $chksum{$pathname} = unpack("%16C*", $_);
} 
sub blank { local($~) = BLANK; write; } 

Reply via email to