-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchksumtree.py
227 lines (186 loc) · 7.28 KB
/
chksumtree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/python
import sys
import pickle
import os
import hashlib
import pprint
import time
from optparse import OptionParser
# Format version of the datfile. Treechksum.save() pickles it together with
# the checksum dict, and Treechksum._read() compares the stored value against
# it and raises when they differ.
VERSION=1.0
def parseOptions():
    '''
    Parse the command line (sys.argv) and return the optparse options object.

    Attributes of the returned object:
        verbose      True when -v/--verbose was given.
        dryrun       True when -n/--noaction was given.
        path         Directory to walk (-p/--path); required.
        data_file    Name of the checksum data file (-d/--data).
        read_buffer  Read size in bytes used while hashing (-b/--buffer), an int.

    Exits through parser.error() (usage message on stderr, exit status 2)
    when the command line is invalid or no path was supplied.
    '''
    usage = """
%prog [options]\n
Scrub a given directory by calculating the md5 hash of every file and compare
it with the one stored in the datfile. If a file's mtime has changed, the md5
in the datfile will be updated. If the md5s are different and the mtime hasn't
changed, an Exception will be raised. """
    parser = OptionParser(usage=usage)
    parser.add_option("-v", "--verbose",
                      action="store_true",
                      dest="verbose",
                      default=False,
                      help="Verbose output")
    parser.add_option("-n", "--noaction",
                      action="store_true",
                      dest="dryrun",
                      default=False,
                      help="Dry run. No action will be taken.")
    parser.add_option("-p", "--path",
                      action="store",
                      dest="path",
                      help="Path to walk")
    parser.add_option("-d", "--data",
                      action="store",
                      dest="data_file",
                      default=".chksumdat",
                      help="Data file to store path checksums")
    parser.add_option("-b", "--buffer",
                      action="store",
                      dest="read_buffer",
                      type="int",
                      # An int default matches the declared option type.
                      default=8192,
                      help="Read buffer used when calculating the md5sum in bytes")
    # Positional arguments are not used by this tool.
    (options, _args) = parser.parse_args()
    # Without a path the tool has nothing to walk; fail here with a usage
    # message instead of crashing later when the path is joined.
    if not options.path:
        parser.error("a path to walk is required (-p/--path)")
    return options
class Filechksum():
    '''
    Checksum record for a single file: its path, md5 digest and stat result.
    '''

    def __init__(self, options, path):
        '''
        Filechksum.path = full path to file
        Filechksum.md5sum = checksum for file
        Filechksum.stat = stat for file

        options must provide read_buffer, the chunk size in bytes handed to
        md5sum(). Errors from opening, reading or stat-ing the file propagate.
        '''
        self.path = path
        self.md5sum = md5sum(path, options.read_buffer)
        # Stat the file that was passed in; the name `file` is not a
        # parameter of this method.
        self.stat = os.stat(path)
def md5sum(file, read_buffer):
    '''
    Get the md5 of a file.

    file is the path of the file to hash and read_buffer the number of bytes
    to read per chunk. Returns the digest as a hexadecimal string.

    Raises ValueError when read_buffer is 0 and propagates any error raised
    while opening or reading the file.
    '''
    # A zero-sized read returns no data, which would end the loop at once
    # and report the digest of empty input for every file.
    if read_buffer == 0:
        raise ValueError("read_buffer must not be 0")
    md5 = hashlib.md5()
    # The with-statement closes the handle even when a read fails.
    with open(file, 'rb') as f:
        # The file is opened in binary mode, so end-of-file is signalled by
        # an empty bytes object; the sentinel has to be bytes as well.
        for chunk in iter(lambda: f.read(read_buffer), b''):
            md5.update(chunk)
    return md5.hexdigest()
class Treechksum():
    '''
    Checksum database for a directory tree, persisted as a pickle (the
    "datfile") containing the tuple (VERSION, cksums).
    '''

    def __init__(self, options, datfile, path):
        '''
        Treechksum.datfile = os.path.join(path, datfile); the file checksum
                             data is loaded from and written to.
        Treechksum.cksums = dict of checksum data, keyed by path relative to
                            the tree; each value is {'stat': ..., 'md5': ...}.
        Treechksum.path = full path of tree to checksum

        Loads existing data from the datfile right away (see _read).
        '''
        self.datfile = os.path.join(path, datfile)
        self.path = path
        self.cksums = {}
        self._read(options)

    def _read(self, options):
        '''
        Read the datfile.

        A missing datfile is not an error: the checksum dict stays empty and
        a new datfile is written by save(). Raises Exception when the stored
        version differs from VERSION.
        '''
        if not os.path.exists(self.datfile):
            # Nothing to load and therefore no version to check.
            print("%s does not exist. Creating new one." % self.datfile)
            return
        print("Dat file found successfully")
        # NOTE(security): unpickling can execute arbitrary code; only use
        # datfiles that come from a trusted location.
        # Pickle data is binary, so the file is opened in binary mode.
        with open(self.datfile, "rb") as f:
            (v, cksums) = pickle.load(f)
        if v != VERSION:
            raise Exception("Wrong version. Please delete %s" % self.datfile)
        self.cksums = cksums
        if options.verbose:
            pprint.pprint(self.cksums)

    def save(self):
        '''
        Save the datfile, replacing its previous content with the pickled
        tuple (VERSION, cksums).
        '''
        # "wb" truncates and writes bytes; the with-statement closes the
        # handle even if pickling fails.
        with open(self.datfile, "wb") as f:
            pickle.dump((VERSION, self.cksums), f)

    def compute(self, options):
        '''
        Actually do the work. Walk the given directory, compute md5s,
        diff it with the known md5, if the mtime is the same and the md5s
        are the same, you're good. If mtime is different, update the file's
        md5 in the datfile. GC removed files from the datfile to save space.

        Raises Exception (from _checkfile) when a file's md5 changed while
        its mtime did not.
        '''
        seen = set()
        total_keys = len(self.cksums)
        count = 0
        # Normalised once so the datfile itself can be recognised by its
        # full path, whether or not self.path ends with a separator.
        own_datfile = os.path.normpath(self.datfile)
        for (root, dirs, files) in os.walk(self.path):
            for name in files:
                if name == ".DS_Store":
                    continue
                in_file = os.path.join(root, name)
                # The datfile changes on every save and must not be
                # checksummed as part of the tree.
                if os.path.normpath(in_file) == own_datfile:
                    continue
                if not os.path.isfile(in_file):
                    continue
                # add it to the files we've seen
                # so we can subtract it from the dict
                # to gc the deleted ones
                seen.add(self._get_rel_path(in_file))
                self._checkfile(in_file, options)
                count = count + 1
                if not options.verbose:
                    self._printprogress(count, total_keys)
        self._gc(seen)
        print("\n")

    def _get_rel_path(self, in_file):
        '''
        Return in_file without the leading tree path and without leading
        slashes; this is the key used in the checksum dict.
        '''
        if in_file.startswith(self.path):
            rel_path = in_file[len(self.path):].lstrip("/")
        else:
            rel_path = in_file.lstrip("/")
        return rel_path

    def _checkfile(self, in_file, options):
        '''
        Add new files, check existing files, and update modified files.

        Raises Exception when the stored and current md5 differ although the
        mtime is unchanged.
        '''
        in_file_cksum = {'stat': os.stat(in_file),
                         'md5': md5sum(in_file, options.read_buffer)}
        if options.verbose:
            print(in_file)
        rel_path = self._get_rel_path(in_file)
        if options.verbose:
            print(rel_path)
        known = self.cksums.get(rel_path)
        if known is None:
            # New file.
            print("%s was added." % rel_path)
            self.cksums[rel_path] = in_file_cksum
            return
        # check if the file was updated
        if known['stat'].st_mtime != in_file_cksum['stat'].st_mtime:
            # file was modified
            print("%s was updated to %s on %s" % (
                rel_path,
                in_file_cksum['md5'],
                time.ctime(in_file_cksum['stat'].st_mtime)))
            self.cksums[rel_path] = in_file_cksum
            return
        # mtime is the same. check md5
        if known['md5'] != in_file_cksum['md5']:
            # Content changed without an mtime change: report corruption.
            raise Exception("%s changed from %s to %s" % (
                rel_path,
                known['md5'],
                in_file_cksum['md5']))
        if options.verbose:
            print("%s passes md5 %s" % (rel_path, in_file_cksum['md5']))

    def _gc(self, seen):
        '''
        Remove unseen files from datfile

        seen is an iterable of the relative paths found during the walk.
        '''
        # Sorted so the deletion messages come out in a stable order.
        for missing in sorted(set(self.cksums) - set(seen)):
            print("%s was deleted" % missing)
            del self.cksums[missing]

    def _printprogress(self, sofar, total):
        '''
        Write a one-line progress indicator to stdout, padded to 78 columns
        and ended with a carriage return so the next call overwrites it.
        '''
        if total > 0:
            s = "\t%s/%s Files" % (sofar, total)
        else:
            s = "\t%s Files" % sofar
        sys.stdout.write(s.ljust(78) + "\r")
        sys.stdout.flush()
def main():
    '''
    Command-line entry point: parse the options, echo them, load the
    checksum data for the requested tree, scrub the tree and, unless this is
    a dry run, write the data back.
    '''
    opts = parseOptions()
    pprint.pprint(opts)
    tree = Treechksum(opts, opts.data_file, opts.path)
    tree.compute(opts)
    if opts.dryrun:
        # Dry run: leave the datfile on disk untouched.
        return
    tree.save()


if __name__ == '__main__':
    main()