Sorry for the additional post, but this becomes a "fun" problem when you
consider the badly mangled file and path names that a raw database dump
could create. I've made some improvements to the example script to make
sure such messed up file/path names are handled correctly.
************************************************************************
#!/bin/ksh
# There are plenty of caveats to be aware of to make sure you process
# strings (paths and file names) correctly.
#
# The first is the built-in ksh 'echo' will strip backslashes by
# default, unless used with the '-E' switch. This is the opposite of
# bash where the backslashes are not stripped by default, but the '-e'
# switch forces stripping. The easy answer is to just use printf(1),
# rather than worry if you got the 'echo' correct.
#
# Another caveat is when using 'while read', you need to use the '-r'
# flag to prevent stripping of backslashes.
#
# Also, quoting your variables is highly recommended. This prevents a
# number of issues including odd behavior due to spaces in strings.
# ----------------------- test setup --------------------------------
# Clean up leftovers from any previous run.  The '-f' on the plain rm
# keeps it quiet on the first run, when all.txt/unq.txt/dup.txt do not
# exist yet.  The '--' end-of-options marker protects names that start
# with '-' (like "-d") from being parsed as switches.
rm -rf -- d1 d2 d3
rm -rf -- "d\ 4"
rm -rf -- "d 5"
rm -rf -- "./-d"
rm -rf -- 'x\\ 2'
rm -rf -- '*x'
rm -rf -- 'x"x'
rm -f -- all.txt unq.txt dup.txt
# We need some seriously messed up directory names.
mkdir -- d1 d2 d3 "d\ 4" "d 5" "./-d" 'x\\ 2' '*x' 'x"x'
# And we need some seriously messed up file names in each directory.
# Let the shell glob directly instead of parsing ls(1) output -- parsing
# ls breaks on whitespace in names, which is exactly the failure mode
# this script is about.  Note a bracket expression needs no commas:
# [1,2,3,4,5] would also match a literal ','; [12345] is what we mean.
for i in ../0[12345].png; do
	# Strip the leading path once, replacing the 's/^..\///' sed calls.
	base="${i##*/}"
	cp "$i" "./d1/$(printf "%s" "$base" | sed -e 's/0/1/')"
	cp "$i" "./d2/$(printf "%s" "$base" | sed -e 's/0/2/')"
	cp "$i" "./d3/$(printf "%s" "$base" | sed -e 's/0/3/')"
	cp "$i" "./d\ 4/$(printf "%s" "$base" | sed -e 's/0/3/')"
	cp "$i" "./d 5/$(printf "%s" "$base" | sed -e 's/0/3/')"
	cp "$i" "./-d/$(printf "%s" "$base" | sed -e 's/0/-/')"
	cp "$i" './x\\ 2/'"$(printf "%s" "$base" | sed -e 's/0/3/')"
	cp "$i" './*x/'"$(printf "%s" "$base" | sed -e 's/0/3/')"
	cp "$i" './x"x/'"$(printf "%s" "$base" | sed -e 's/0/?/')"
done
# ----------------------- Actually Useful ---------------------------
# Find all the files, and generate a cksum for each.  Every starting
# point carries a leading "./" -- the advice below applies to the find
# arguments themselves, so '*x' and 'x"x' get the prefix too.
find ./d1 ./d2 ./d3 "./d\ 4/" "./d 5/" "./-d/" './x\\ 2/' './*x/' './x"x/' \
-type f -print0 | xargs -0 cksum -r -a sha256 >all.txt
# For the sake of your sanity, you want the leading "./" or better,
# the fully qualified path to the directories where you want to
# run find(1). This will save you from a lot of possible mistakes
# caused by messed up directory and/or file names.
# After you have a cksum hash for every file, you want to make sure
# your list is sorted, or else other commands will fail since they
# typically expect lists to be pre-sorted.
sort -k 1,1 -o all.txt all.txt
# Generate a list of unique files based on the hash
sort -k 1,1 -u -o unq.txt all.txt
# To get a list of just the duplicates using comm(1)
comm -2 -3 all.txt unq.txt >dup.txt
# ----------------------- Possibly Useful ---------------------------
# Sure, once you have your list of duplicates, you're pretty well set
# assuming you're not afraid of hash collisions. If the data is very
# valuable, and you don't want to risk a hash collision causing you to
# delete something important, you need to use cmp(1) to make sure the
# files really are *identical* duplicates.
#
# Split only on newlines, and turn off pathname expansion ('set -f'):
# the unquoted `cat`/`look` for-lists are subject to globbing, and the
# test data deliberately contains names like './*x/31.png' that would
# otherwise be expanded.
IFS='
'
set -f
for UNQ_LINE in `cat unq.txt`; do
	# Grab the hash from the line (everything before the first space).
	UNQ_HASH=$(printf "%s" "$UNQ_LINE" | sed -E -e 's/ .+$//')
	# Grab the full path and file name (everything after the hash).
	UNQ_FILE=$(printf "%s" "$UNQ_LINE" | sed -E -e 's/^[a-f0-9]{64} //')
	printf "\n"
	printf "UNQ_HASH: %s\n" "$UNQ_HASH"
	printf "UNQ_FILE: %s\n" "$UNQ_FILE"
	# use the look(1) command to find matching hashes in the duplicates
	for DUP_LINE in `look "$UNQ_HASH" dup.txt`; do
		DUP_HASH=$(printf "%s" "$DUP_LINE" | sed -E -e 's/ .+$//')
		DUP_FILE=$(printf "%s" "$DUP_LINE" | sed -E -e 's/^[a-f0-9]{64} //')
		printf "DUP_HASH: %s\n" "$DUP_HASH"
		printf "DUP_FILE: %s\n" "$DUP_FILE"
		# Test cmp's exit status directly.  The old `cmp ...` backtick
		# form executed cmp's *output* as a command: it only worked by
		# accident when the files matched (empty output), and tried to
		# run "file1 file2 differ: ..." as a command when they did not.
		if cmp -s "$UNQ_FILE" "$DUP_FILE"; then
			printf "Binary Compare Matches\n"
			# rm "$DUP_FILE"
		else
			printf "ERROR: Hash Collision\n"
			exit 1
		fi
	done
done
set +f
# Another way to do it, without using sed.
# Restore default word splitting; an unset IFS behaves like the
# default space/tab/newline setting.  (Names with embedded leading
# tabs would be trimmed by read; the hashes themselves never are.)
unset IFS
# Redirect the file into the loop instead of piping from cat(1).
# A pipe runs the while-loop in a subshell, so the 'exit 1' on a
# hash collision would only leave that subshell and the script would
# keep going -- and still finish with 'exit 0'.
while read -r UNQ_HASH UNQ_FILE; do
	printf "\n"
	printf "UNQ_HASH: %s\n" "$UNQ_HASH"
	printf "UNQ_FILE: %s\n" "$UNQ_FILE"
	# The inner loop still reads from a pipe (look's output), so its
	# 'exit 1' only exits that subshell; the '|| exit 1' after 'done'
	# propagates the failure out of the script.
	look "$UNQ_HASH" dup.txt | while read -r DUP_HASH DUP_FILE; do
		printf "DUP_HASH: %s\n" "$DUP_HASH"
		printf "DUP_FILE: %s\n" "$DUP_FILE"
		# Use cmp's exit status; the old `cmp ...` backtick form ran
		# cmp's output as a command.
		if cmp -s "$UNQ_FILE" "$DUP_FILE"; then
			printf "Binary Compare Matches\n"
			# rm "$DUP_FILE"
		else
			printf "ERROR: Hash Collision\n"
			exit 1
		fi
	done || exit 1
done <unq.txt
exit 0
************************************************************************
--
J.C. Roberts