On Thu, Feb 08, 2001 at 12:14:33PM +0000, Dave Hodgkinson wrote:
>
> Anyone got anything to hand that will spot massive duplications in a
> filesystem? I've got a whole bunch of servers mirrored to a backup
> server and it's be nice to identify where entire file trees have been
> replicated...
Curtesy of tomc ... if that is not an oxymoron :-)
I think it dates back to Perl3 ...
--
Chris Benson
#!/usr/bin/perl
# dupf -- find duplicate files.
# [EMAIL PROTECTED]
# 28 july 1993
# nb: run with recent version of perl; i'm getting semi-colon lazy
@ARGV = ('.') unless @ARGV;
require 'find.pl';
&find(@ARGV);
sub wanted { $dupnames{$_} .= "\0$name" if -f; }
foreach $name (sort by_basename keys %dupnames) {
@files = split(/\0/, $dupnames{$name});
shift @files; # leading null field
next if @files == 1;
%size = %devino = %chksum = ();
# compare sizes; if any found, we'll have to
foreach $file (@files) {
$size = -s $file;
$size{$file} = $size;
($dev,$ino) = stat(_); # save cached info
$devino{$file} = "$dev,$ino";
}
@slist = reverse sort by_sameness @files;
if ($- < @slist) { $- = 0; } # force new page if won't fit
$prev = undef;
foreach $file (@slist) {
$flag = ' ';
if ($prev) {
$flag = do {
if ($devino{$prev} eq $devino{$file}) { 'L' }
elsif (&chksum($prev) == &chksum($file)) { 'C' }
elsif ($size{$prev} == $size{$file}) { 'S' }
else { ' ' }
}
}
write;
$prev = $file;
}
␣
}
format STDOUT_TOP =
DevIno Size Chksum F Filename
---------- -------- ------ - --------
.
format STDOUT =
@<<<<<<<<< @>>>>>>> @>>>>> @ @*
$devino{$file}, $size{$file}, &chksum($file), $flag, $file
.
format BLANK =
.
sub by_sameness {
&chksum($a) <=> &chksum($b)
||
$devino{$a} cmp $devino{$b}
||
$size{$a} <=> $size{$b}
||
$a cmp $b;
}
sub by_basename {
substr($a, rindex($a, '/')+1) cmp substr($b, rindex($b, '/')+1);
}
sub chksum {
local($pathname) = @_;
local($/) = undef; local($_);
if (defined $chksum{$pathname}) { return $chksum{$pathname} }
if (!open (FILE, $pathname)) {
warn "can't open $pathname for chksum: $!\n";
return undef;
}
$_ = <FILE>;
$chksum{$pathname} = unpack("%16C*", $_);
}
sub blank { local($~) = BLANK; write; }