find-duplicate-chunks · executable file · 108 lines (86 loc) · 3.13 KB
#!/usr/bin/env python
#
# Report duplicate chunks of data in a filesystem.
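#
# Usage: find-duplicate-chunks CHUNK_SIZE OFFSET DIRNAME
#
# A sketch of a typical invocation (the argument values are only
# illustrative; the script itself just takes the three positional
# arguments above):
#
#     ./find-duplicate-chunks 4096 4096 /some/directory
#
# The work is split into four resumable steps: compute a checksum for
# every chunk of every regular file under DIRNAME, sort the checksums,
# count duplicates, and write a summary report. Each step writes its
# output to data-<chunk_size>-<offset>.checksums, .sorted, .counts, or
# .report in the current directory, and a step is skipped if its output
# file already exists.
#
# Note: the script is written for Python 2 (it uses the file() builtin
# and the print statement).
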
import hashlib
import os
import subprocess
import sys
import tempfile


def compute(f, chunk_size, offset):
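    # Slide a window of chunk_size bytes through the open file f, advancing
    # it offset bytes at a time, and yield the MD5 hex digest of each full
    # window. With offset equal to chunk_size the chunks are disjoint
    # blocks; with a smaller offset they overlap.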
    chunk = f.read(chunk_size)
    while len(chunk) == chunk_size:
        yield hashlib.md5(chunk).hexdigest()
        chunk = chunk[offset:] + f.read(offset)


def compute_checksums(f, chunk_size, offset, dirname):
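    # Walk the directory tree and write one checksum per line to f for
    # every chunk of every regular file; symlinks are skipped.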
    for dirname, subdirs, filenames in os.walk(dirname):
        for filename in filenames:
            pathname = os.path.join(dirname, filename)
            if os.path.isfile(pathname) and not os.path.islink(pathname):
                ff = file(pathname)
                for checksum in compute(ff, chunk_size, offset):
                    f.write('%s\n' % checksum)
                ff.close()


def sort_checksums(f, checksums_name):
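    # Sort the checksum list with the external sort(1), keeping temporary
    # files in the current directory and using a 1 GiB in-memory buffer
    # (the --batch-size option assumes GNU coreutils sort).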
    subprocess.check_call(['sort',
                           '-T', '.',
                           '--batch-size', '1000',
                           '-S', '1G',
                           ],
                          stdin=file(checksums_name),
                          stdout=f)


def count_duplicates(f, sorted_name):
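    # Collapse the sorted checksums with uniq -c, prefixing each distinct
    # checksum with the number of times it occurs.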
    subprocess.check_call(['uniq', '-c'], stdin=file(sorted_name), stdout=f)


def make_report(f, counts_name, chunk_size, offset):
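    # Summarise the counts file: the number of distinct checksums, a
    # histogram of repeat counts bucketed by powers of ten, and an estimate
    # of how many bytes de-duplicating identical chunks would save.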
    num_diff_checksums = 0
    saved = 0
    total = 0
    limits = [1]
    counts = { 1: 0 }
    for line in file(counts_name):
        count, checksum = line.split()
        count = int(count)
        num_diff_checksums += 1
        saved += (count-1) * chunk_size
        total += count * chunk_size
        while limits[-1] < count:
            n = limits[-1] * 10
            limits.append(n)
            counts[n] = 0
        for limit in limits:
            if count <= limit:
                counts[limit] += count
                break

    f.write('chunk size: %d\n' % chunk_size)
    f.write('offset: %d\n' % offset)
    f.write('#different checksums: %d\n' % num_diff_checksums)
    f.write('%8s %8s\n' % ('repeats', 'how many'))
    for limit in limits:
        f.write('%8d %8d\n' % (limit, counts[limit]))
    f.write('bytes saved by de-duplication: %d\n' % saved)
    f.write('%% saved: %f\n' % (100.0*saved/total))


def main():
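    # Run the pipeline step by step. Each step writes into a temporary file
    # that is renamed into place only when the step finishes, and steps
    # whose output file already exists are skipped, so an interrupted run
    # can be resumed.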
    chunk_size = int(sys.argv[1])
    offset = int(sys.argv[2])
    dirname = sys.argv[3]

    prefix = 'data-%04d-%04d' % (chunk_size, offset)
    checksums_name = prefix + '.checksums'
    sorted_name = prefix + '.sorted'
    counts_name = prefix + '.counts'
    report_name = prefix + '.report'

    steps = (
        (checksums_name, compute_checksums, (chunk_size, offset, dirname)),
        (sorted_name, sort_checksums, (checksums_name,)),
        (counts_name, count_duplicates, (sorted_name,)),
        (report_name, make_report, (counts_name, chunk_size, offset)),
    )

    for filename, func, args in steps:
        if not os.path.exists(filename):
            print 'Step:', func.__name__
            fd, output_name = tempfile.mkstemp(dir='.')
            os.close(fd)
            f = file(output_name, 'w')
            func(*((f,) + args))
            f.close()
            os.rename(output_name, filename)


if __name__ == '__main__':
    main()