Re: Tool to compare directories and delete duplicate files from one directory
Date: Fri, 19 May 2023 17:19:33 UTC
> ----------------------------------------
> From: Sysadmin Lists <sysadmin.lists@mailfence.com>
> Date: May 14, 2023, 3:48:52 PM
> To: <questions@freebsd.org>
> Subject: Re: Tool to compare directories and delete duplicate files from one directory
>
> "[...] looking for files that have the same name anywhere in the set of dirs,
then comparing their sizes [...] that's way easier to program."
Would have been, but I decided to mimic fdupes and jdupes, instead. I figured,
in order for a scripted language to have any chance against a compiled
C-program it would need to make comparatively fewer hash-checks, and for awk
specifically, to use awk's array-membership test (if (elem in array)) instead
of for-looping over lists of files and performing string-comparisons. I used a
mixture of both.
Performance is pretty good:
$ time dedup_multidirs.sh -V dedup{1..13}
DEBUG: 313087 files, 3497 duplicates, 309590 unique, 42848 stat calls
# 773723 differences: same filenames, different sizes or hashes
real 1m32.719s
user 0m50.671s
sys 0m44.054s
$ du -xs dedup{1..13} | awk '{ sum = sum + $1 } END { print sum }'
219195746 # 200G+ of data
What I do is:
i) only process files that have more than one name-match in the set of dirs
ii) use least-expensive tests first: `stat' to check file inodes and sizes
iii) generate and store hashes only after a possible duplicate is found
iv) perform file-dir membership tests to abort for-loops early
v) keep track of number of per-filename children in each dir to abort early
vi) "if A = B, and A = C, then B = C" ie, we report A and B are dups, and
A and C are dups, but not B and C; reduces a comparison by one
vii) likewise, "if A != B, and A = C, then B != C"; saves a comparison again
- if a duplicate is found, the user is prompted to delete the lower-priority
dup, determined by order of arguments: highest first, lowest last (dir1 dir2).
- if a name-only match is found (different sizes or hashes), it's reported but
not deleted.
- Safety First: safeguards are turned on by default and must be turned off
using options: N = no-dryrun, V = no messages, I = no prompt to delete files,
with an optional DEBUG=1 to turn on debugging: DEBUG=1 ./program dir1 dir2 ...
- to dedup a single dir, list it twice: ./program dir1 dir1
Resource usage is tiny: each system() call is closed immediately, releasing its
resources. Although thousands of `stat' and `xxhash' calls are made, they're
made sequentially, and the memory foot-print of the entire run is minimal.
During testing, the program used less than 125M of RAM, and disk I/O was tiny.
Caveat: it's probably riddled with bugs that my limited testing didn't catch.
Limitations: two files with different names are not analyzed, even if their
contents are identical: cp -a file1 file2
Probably has limited applicability, but was a fun head-scratcher (to do well).
Final caveat: it might not even do what I described here, but it wants to! :P
Flame-war starter: would love to see you Perl-fanbois beat those numbers. ;)
#!/bin/sh -e
# remove or report duplicate files: $0 [-VIN] dir1 dir2 ... dir[n]
if [ "X${1%%-*}" = "X" ]; then opts=$1; shift; fi
echo "Building files list from: ${@}"
find "${@}" -xdev -type f | awk -v opts=$opts -v e=$DEBUG '
BEGIN { stat = "stat -f \"%i %z\" "; rm = "rm -i "
if (opts) { split(opts, flags, ""); for (f in flags) o[flags[f]] }
if ("V" in o) V = 1; if ("I" in o) rm = "rm -v "; if ("N" in o) N = 1
for (x = 1; x < ARGC; x++) args = args ? args "|" ARGV[x] : ARGV[x]
if (ARGV[1] == ARGV[2]) { ch = 2 } else ch = 1; ARGC = 0
}
{ filename = substr($0, match($0, /[^\/]+$/))
dups[filename, ++files[filename]] = $0
basepath = substr($0, match($0, "(" args ")/?"), RLENGTH)
sub("/*$", "/", basepath); hasf[basepath, filename]++
total++
}
END { for (i in ARGV) sub("/*$", "/", ARGV[i])
system("date")
print "Comparing files."
for (file in files)
if (files[file] > ch)
for (i = 1; i < x; i++) {
if (ARGV[i] SUBSEP file in hasf)
for (j = 1; j < files[file]; j++)
if (dups[file, j] ~ "^" ARGV[i] &&
! (dups[file, j] in processed)) {
for (k = i +1; k < x; k++) {
if (ARGV[k] SUBSEP file in hasf) {
getstats(dups[file, j])
inode = $1; size = $2
for (l = 1; l <= files[file]; l++) {
if (e) debug(1)
if (dups[file, j] == dups[file, l])
continue
if (dups[file, l] ~ "^" ARGV[k]) {
if (dups[file, l] in processed)
continue
f = dups[file, j]
d = dups[file, l]
getstats(d)
if (e) debug(2)
if (inode == $1) { }
else if (size == $2) {
hashcheck(f, d)
processed[d]
hits++ }
else act("diff")
if (c++ == hasf[ARGV[k], file])
break
} } } }
if (e) debug(3)
processed[dups[file, j]]; delete dups[file, j]
} }
system("date")
printf("DEBUG: %d %s, %d %s, %d %s, %d %s\n",
total, "files", hits, "duplicates",
(total - hits), "unique", statcalls,"stat calls")
}
function act(message) {
if (e) debug(4)
if (message == "dup") { if (!V) print "duplicates: " d, "\t", f
if ( N) system(rm "\"" d "\" </dev/tty") }
else if (!V) {
printf("difference: %8-s %10-s %s\t%s\n", "inode","size","hash","file")
printf("%12s %s %30s\n", " ", stats[f], f)
printf("%12s %s %30s\n", " ", stats[d], d) }
}
function debug(y, fname) {
if (y == 1) printf("DEBUG: for-loop(1) %-45s %-20s %s\n"," F|"dups[file, j],
" D|"dups[file, l], "H|"hasf[ARGV[k], file])
if (y == 2) printf("DEBUG: for-loop(2) %-45s %-20s %s %s\n", "Fi|"f" "inode,
"Di|"d" "$1, "Fs|" size, "Ds|" $2)
if (y == 3) print "DEBUG: for-loop(3)", " F|"dups[file, j],
"mark processed, delete elem"
if (y == 4) print "DEBUG: act()"
if (y == 5) printf("DEBUG: getstats() %-28s %-28s\n", " F|"fname,
"Fs|"stats[fname])
if (y == 6) printf("DEBUG: hashcheck() %-28s %-28s %-28s %-28s\n",
" F|"f, " D|"d, "Ft|"stats[f], "Dt|"stats[d])
}
function getstats(fname) {
if (e) debug(5, fname)
if (split(stats[fname], arr) < 2) { stat "\"" fname "\"" | getline
close(stat "\"" fname "\"")
stats[fname] = $1 " " $2
statcalls++ }
else { $1 = arr[1]; $2 = arr[2] }
if (e) debug(5, fname)
}
function hashcheck(fname, dname) {
if (e) debug(6)
if (split(stats[fname], farr) < 3) {
"xxhsum \"" fname "\"" | getline; close("xxhsum \"" fname "\"")
stats[fname] = stats[fname] " " (farr[3] = $1) }
if (split(stats[dname], darr) < 3) {
"xxhsum \"" dname "\"" | getline; close("xxhsum \"" dname "\"")
stats[dname] = stats[dname] " " (darr[3] = $1) }
if (e) debug(6)
if (farr[3] == darr[3]) { act("dup") } else act("diff")
}' "${@}"
--
Sent with https://mailfence.com
Secure and private email