Sorry for the additional post, but this is a "fun" problem when you consider the possibly messed-up file/path names that could be created by a raw database dump. I made some improvements to the example script to make sure such messed-up file/path names are handled correctly.
************************************************************************
#!/bin/ksh
# There are plenty of caveats to be aware of to make sure you process
# strings (paths and file names) correctly.
#
# The first is the built-in ksh 'echo' will strip backslashes by
# default, unless used with the '-E' switch.  This is the opposite of
# bash where the backslashes are not stripped by default, but the '-e'
# switch forces stripping.  The easy answer is to just use printf(1),
# rather than worry if you got the 'echo' correct.
#
# Another caveat is when using 'while read', you need to use the '-r'
# flag to prevent stripping of backslashes.
#
# Also, quoting your variables is highly recommended.  This prevents a
# number of issues including odd behavior due to spaces in strings.

# ----------------------- test setup --------------------------------
# Clean up leftovers from any previous run.  Use 'rm -f' so missing
# files are not an error on the first run, and '--' (or a leading ./)
# so a name starting with '-' is never parsed as an option -- we are
# about to create a directory literally named "-d".
rm -rf -- d1 d2 d3
rm -rf -- "d\ 4"
rm -rf -- "d 5"
rm -rf -- "./-d"
rm -rf -- 'x\\ 2'
rm -rf -- '*x'
rm -rf -- 'x"x'
rm -f -- all.txt unq.txt dup.txt

# We need some seriously messed up directory names: embedded
# backslashes, spaces, a leading dash, a glob character, and a
# double quote.
mkdir -- d1 d2 d3 "d\ 4" "d 5" "./-d" 'x\\ 2' '*x' 'x"x'

# And we need some seriously messed up file names in each directory.
# Copy the source images into each directory, mangling the names as we
# go.  Use a shell glob instead of parsing ls(1) output -- parsing ls
# word-splits on whitespace and breaks on exactly the kind of hostile
# names this script is about.
for i in ../0[1-5].png; do
    # Strip the leading "../" once; it contains no '0', so doing this
    # before the digit substitution does not change the result.
    f=`printf "%s" "$i" | sed -e 's/^..\///'`
    cp "$i" "./d1/`printf "%s" "$f" | sed -e 's/0/1/'`"
    cp "$i" "./d2/`printf "%s" "$f" | sed -e 's/0/2/'`"
    cp "$i" "./d3/`printf "%s" "$f" | sed -e 's/0/3/'`"
    cp "$i" "./d\ 4/`printf "%s" "$f" | sed -e 's/0/3/'`"
    cp "$i" "./d 5/`printf "%s" "$f" | sed -e 's/0/3/'`"
    cp "$i" "./-d/`printf "%s" "$f" | sed -e 's/0/-/'`"
    cp "$i" './x\\ 2/'"`printf "%s" "$f" | sed -e 's/0/3/'`"
    cp "$i" './*x/'"`printf "%s" "$f" | sed -e 's/0/3/'`"
    cp "$i" './x"x/'"`printf "%s" "$f" | sed -e 's/0/?/'`"
done

# ----------------------- Actually Useful ---------------------------
# Find all the files, and generate a cksum hash for each.
#
# For the sake of your sanity, you want the leading "./" (or better,
# the fully qualified path) on *every* directory handed to find(1).
# This saves you from a lot of possible mistakes caused by messed up
# directory names -- note "./-d" and the glob/quote names below.
find ./d1 ./d2 ./d3 "./d\ 4/" "./d 5/" "./-d/" './x\\ 2/' './*x/' './x"x/' \
    -type f -print0 | xargs -0 cksum -r -a sha256 >all.txt

# After you have a cksum hash for every file, you want to make sure
# your list is sorted, or else other commands will fail since they
# typically expect lists to be pre-sorted.
sort -k 1,1 -o all.txt all.txt

# Generate a list of unique files based on the hash.
sort -k 1,1 -u -o unq.txt all.txt

# To get a list of just the duplicates, use comm(1).
comm -2 -3 all.txt unq.txt >dup.txt

# ----------------------- Possibly Useful ---------------------------
# Sure, once you have your list of duplicates, you're pretty well set,
# assuming you're not afraid of hash collisions.  If the data is very
# valuable, and you don't want to risk a hash collision causing you to
# delete something important, you need to use cmp(1) to make sure the
# files really are *identical* duplicates.
# Read unq.txt line by line with 'while read -r' instead of
# word-splitting `cat unq.txt` in a for loop; IFS= preserves any
# leading/trailing spaces in the line, and -r preserves backslashes.
# Reading via redirection (not 'cat file | while ...') keeps the loop
# in the current shell, so 'exit 1' on a collision really stops the
# script instead of only killing a pipeline subshell.
while IFS= read -r UNQ_LINE; do
    # Grab the hash from the line (everything before the first space).
    UNQ_HASH=`printf "%s" "$UNQ_LINE" | sed -E -e 's/ .+$//'`
    # Grab the full path and file name (everything after the hash).
    UNQ_FILE=`printf "%s" "$UNQ_LINE" | sed -E -e 's/^[a-f0-9]{64} //'`
    printf "\n"
    printf "UNQ_HASH: %s\n" "$UNQ_HASH"
    printf "UNQ_FILE: %s\n" "$UNQ_FILE"
    # Use the look(1) command to find matching hashes in the
    # duplicates.  The hash is quoted -- quote everything.
    look "$UNQ_HASH" dup.txt | while IFS= read -r DUP_LINE; do
        DUP_HASH=`printf "%s" "$DUP_LINE" | sed -E -e 's/ .+$//'`
        DUP_FILE=`printf "%s" "$DUP_LINE" | sed -E -e 's/^[a-f0-9]{64} //'`
        printf "DUP_HASH: %s\n" "$DUP_HASH"
        printf "DUP_FILE: %s\n" "$DUP_FILE"
        # Run cmp(1) directly in the 'if'.  Wrapping it in backticks
        # would execute cmp's *output* as a command, which only works
        # by accident; '-s' keeps cmp quiet either way.
        if cmp -s "$UNQ_FILE" "$DUP_FILE"; then
            printf "Binary Compare Matches\n"
            # rm -- "$DUP_FILE"
        else
            printf "ERROR: Hash Collision\n"
            exit 1
        fi
    # The inner loop runs in a pipeline subshell, so propagate its
    # failure status out to the script.
    done || exit 1
done <unq.txt

# Another way to do it, without using sed: let read itself split the
# hash from the file name on the first run of whitespace.  Everything
# after the first field lands in the last variable, so file names
# containing spaces survive intact.
while read -r UNQ_HASH UNQ_FILE; do
    printf "\n"
    printf "UNQ_HASH: %s\n" "$UNQ_HASH"
    printf "UNQ_FILE: %s\n" "$UNQ_FILE"
    look "$UNQ_HASH" dup.txt | while read -r DUP_HASH DUP_FILE; do
        printf "DUP_HASH: %s\n" "$DUP_HASH"
        printf "DUP_FILE: %s\n" "$DUP_FILE"
        if cmp -s "$UNQ_FILE" "$DUP_FILE"; then
            printf "Binary Compare Matches\n"
            # rm -- "$DUP_FILE"
        else
            printf "ERROR: Hash Collision\n"
            exit 1
        fi
    done || exit 1
done <unq.txt

exit 0
************************************************************************
--
J.C. Roberts