def visit(files_by_size, dirname, names):
    """os.path.walk-style callback: bucket the regular files in *dirname* by size.

    Parameters:
        files_by_size: dict mapping file size (int) -> list of paths;
            mutated in place.
        dirname: directory currently being visited.
        names: entry names within *dirname* (files and subdirectories).
    """
    for name in names:
        path = os.path.join(dirname, name)
        # Only regular files count — subdirectories and other entries are skipped.
        if os.path.isfile(path):
            # setdefault replaces the manual "in dict / else insert" two-branch dance.
            files_by_size.setdefault(os.path.getsize(path), []).append(path)
def hash_file(path, chunk_size=1 << 20):
    """Return the hex MD5 digest of the file at *path*.

    Opens in binary mode ('rb') — the original opened in text mode, which
    corrupts digests wherever newline translation applies — and reads in
    fixed-size chunks so arbitrarily large files never have to fit in memory.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # iter(callable, sentinel) keeps yielding chunks until read() returns b''.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicates(roots):
    """Yield (hexdigest, path) pairs for duplicate files under each root.

    Preserves the original semantics: each root in *roots* is scanned
    independently; files are first grouped by size (only same-size files can
    be identical), and only size groups with more than one member are hashed.
    Every path belonging to a digest shared by two or more files is yielded.
    """
    for root in roots:
        files_by_size = {}
        # os.walk replaces os.path.walk, which was removed in Python 3.
        for dirpath, _dirnames, filenames in os.walk(root):
            for name in filenames:
                path = os.path.join(dirpath, name)
                if os.path.isfile(path):
                    files_by_size.setdefault(os.path.getsize(path), []).append(path)
        files_by_hash = {}
        for paths in files_by_size.values():
            if len(paths) > 1:  # a unique size cannot have a duplicate
                for path in paths:
                    # 'digest' key — the original bound this to 'hash',
                    # shadowing the builtin.
                    files_by_hash.setdefault(hash_file(path), []).append(path)
        for digest, paths in files_by_hash.items():
            if len(paths) > 1:
                for path in paths:
                    yield digest, path


if __name__ == '__main__':
    # Print one "<md5> <path>" line per duplicate file found under the
    # directories named on the command line.
    for digest, path in find_duplicates(sys.argv[1:]):
        print('%s %s' % (digest, path))
# find, sort, uniq, xargs: four tools, each written in thousands of lines of C,
# to do something that can be done in ~37 lines of Python WITHOUT checksumming
# every file twice and sorting it twice. No way I'm running that pipeline over
# 800 GB of backup data.