Removed --algo and forced sha256; preloading is now much faster (seconds instead of hours)

This commit is contained in:
Martinez 2018-01-01 19:47:20 +01:00
parent 52b8cda647
commit 59b9ade124

41
hast.py
View file

@ -43,9 +43,9 @@ def walk(entry, callback, *args, **kwargs):
# Classes # Classes
class UniqueStore: class UniqueStore:
def __init__(self, path, algo): def __init__(self, path):
self.path = path self.path = path
self.hasher = getattr(hashlib, algo) self.hasher = getattr(hashlib, "sha256")
self.log = over.text.Output("store") self.log = over.text.Output("store")
self.count_new = 0 self.count_new = 0
@ -74,9 +74,20 @@ class UniqueStore:
stat = os.stat(path) stat = os.stat(path)
size = stat.st_size size = stat.st_size
hasher = self.hasher(open(path, "rb").read()) filename_src = os.path.basename(path)
hash_raw = hasher.digest() filename_src_name, filename_src_suffix = os.path.splitext(filename_src)
hash_hr = hasher.hexdigest()[:16] if "." in filename_src_suffix: filename_src_suffix = filename_src_suffix[1:]
if preload:
hash_hex = filename_src_name.split()[-1]
hash_raw = bytes(over.misc.hex_to_raw(hash_hex))
else:
hasher = self.hasher(open(path, "rb").read())
hash_raw = hasher.digest()
hash_hex = hasher.hexdigest()
if not len(hash_raw) == 32:
raise RuntimeError("%s: sha256 is not 256 bits long. This is bad." %(path))
is_new = False is_new = False
@ -85,20 +96,17 @@ class UniqueStore:
if hash_raw in suspects: if hash_raw in suspects:
store_path = suspects[hash_raw] store_path = suspects[hash_raw]
if verbose: self.log("%s (%s) is already known as %s" %(path, hash_hr, store_path)) if verbose:
self.log("%s (%s) is already known as %s" %(path, hash_hex, store_path), self.log.tl.warn)
else: else:
is_new = True is_new = True
else: else:
is_new = True is_new = True
if is_new: if is_new:
filename_src = os.path.basename(path)
filename_src_name, filename_src_suffix = os.path.splitext(filename_src)
if "." in filename_src_suffix: filename_src_suffix = filename_src_suffix[1:]
mtime = None mtime = None
if dt_from_name: if dt_from_name and not preload:
try: try:
dt_raw = re.findall("([12][0-9]{3}[01][0-9][0-3][0-9])[_\-:,. ]([012][0-9][0-5][0-9][0-5][0-9])", filename_src_name)[0] dt_raw = re.findall("([12][0-9]{3}[01][0-9][0-3][0-9])[_\-:,. ]([012][0-9][0-5][0-9][0-5][0-9])", filename_src_name)[0]
dt_raw = " ".join(dt_raw) dt_raw = " ".join(dt_raw)
@ -113,9 +121,9 @@ class UniqueStore:
filename_str = filename_src filename_str = filename_src
else: else:
filename_str = "%s %s %s.%s" %( filename_str = "%s %s %s.%s" %(
filename_src_name,
mtime.strftime("%Y-%m-%d %H:%M:%S"), mtime.strftime("%Y-%m-%d %H:%M:%S"),
hash_hr, filename_src_name,
hash_hex,
filename_src_suffix filename_src_suffix
) )
@ -131,7 +139,7 @@ class UniqueStore:
self.count_total += 1 self.count_total += 1
if verbose: if verbose:
self.log("%s (%s) added" %(path, hash_hr), self.log.tl.done) self.log("%s (%s) added" %(path, hash_hex), self.log.tl.done)
elif not preload: elif not preload:
self.count_skipped += 1 self.count_skipped += 1
@ -143,11 +151,10 @@ class UniqueStore:
if __name__ == "__main__": if __name__ == "__main__":
main = over.app.Main("hast", version.str, "AO-JSL") main = over.app.Main("hast", version.str, "AO-JSL")
main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s") main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s")
main.add_option("algo", "Hashing operation to use, use anything supported by hashlib. Either <M>sha256<.> or <M>sha512<.> is recommended, though. The latter is significantly faster on 64bit CPUs.", str, ["sha512"], count=1, abbr="a")
main.add_option("datetime-from-name", "Extract file datetime from its name if it matches IMG_YYYYmmdd_HHMMSS.", bool, [True], abbr="d") main.add_option("datetime-from-name", "Extract file datetime from its name if it matches IMG_YYYYmmdd_HHMMSS.", bool, [True], abbr="d")
main.add_option("stats", "Print stats before exiting.", bool, [True]) main.add_option("stats", "Print stats before exiting.", bool, [True])
main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v") main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v")
main.add_doc("Description", ["Hashed storage - imports files from directory <m>targets<.> into the <W>--<g>output<.> flat directory, making sure these are unique. The files are all renamed to <c>original-filename mtime hash.suffix<.>."]) main.add_doc("Description", ["Hashed storage - imports files from directory <m>targets<.> into the <W>--<g>output<.> flat directory, making sure these are unique. The files are all renamed to <c>mtime original-filename sha256.suffix<.>."])
main.setup() main.setup()
if not main.cfg.store: if not main.cfg.store:
@ -164,7 +171,7 @@ if __name__ == "__main__":
main.print("<r>%s<.> is not a writable store directory" %(store_dir), main.print.tl.fail) main.print("<r>%s<.> is not a writable store directory" %(store_dir), main.print.tl.fail)
main.exit(1) main.exit(1)
store = UniqueStore(store_dir, main.cfg.algo) store = UniqueStore(store_dir)
for src in main.targets: for src in main.targets:
if main.cfg.verbose: if main.cfg.verbose: