#! /usr/bin/env python3
# encoding: utf-8

# library imports
import datetime
import glob
import hashlib
import shutil
import re
import os
import over

# local imports
import version

# --------------------------------------------------
# Exceptions

class ConfigurationError(Exception):
    pass

# --------------------------------------------------
# Functions

def walk(entry, callback, *args, **kwargs):
    """
    For each file contained under the `entry` point, calls callback(file, *args, **kwargs).

    @while processing a directory tree
    """

    # Symlinks are resolved first, so the callback always receives real paths.
    entry = os.path.realpath(entry)

    if os.path.isdir(entry):
        for name in os.listdir(entry):
            path = os.path.join(entry, name)
            walk(path, callback, *args, **kwargs)
    else:
        callback(entry, *args, **kwargs)

# --------------------------------------------------
# Classes

class UniqueStore:
    """
    Flat directory of content-unique files, indexed in memory by size and sha256.

    Stored files are named "<mtime> <original name> <sha256 hex>.<suffix>"; the
    hash of an already stored file is read back from its name, not recomputed.
    """

    # YYYYmmdd, one separator character, HHMMSS (e.g. IMG_20200131_235959).
    _NAME_DT_PATTERN = re.compile(
        r"([12][0-9]{3}[01][0-9][0-3][0-9])[_\-:,. ]([012][0-9][0-5][0-9][0-5][0-9])"
    )

    # Read size used while hashing, so large files are never held in memory whole.
    _HASH_CHUNK_SIZE = 1 << 20

    def __init__(self, path):
        """
        Opens the store directory `path` and indexes every file already in it.
        """

        self.path = path
        self.hasher = hashlib.sha256
        self.log = over.text.Output("store")
        self.count_new = 0
        self.count_skipped = 0
        self.count_total = 0

        self.store = {} # size: {hash: path in store}

        self.log("loading storage", self.log.tl.start)
        for name in os.listdir(path):
            self.add(os.path.join(path, name), preload=True)
            # add() does not touch the counters when preloading
            self.count_total += 1
        self.log("loaded %d blocks" %(self.count_total), self.log.tl.done)

    def log_stats(self):
        """
        Logs how many files were added, skipped and are held in total.
        """

        self.log("added %d, skipped %d, holding %d objects" %(self.count_new, self.count_skipped, self.count_total))

    def _hash_file(self, path):
        """
        Returns a hasher object fed with the contents of `path`, read in chunks.

        @while hashing a file
        """

        hasher = self.hasher()

        with open(path, "rb") as handle:
            for chunk in iter(lambda: handle.read(self._HASH_CHUNK_SIZE), b""):
                hasher.update(chunk)

        return hasher

    def _datetime_from_name(self, name):
        """
        Returns the datetime embedded in `name`, or None if there is no valid one.

        @while extracting a datetime from a file name
        """

        match = self._NAME_DT_PATTERN.search(name)

        if match is None:
            return None

        try:
            return datetime.datetime.strptime(" ".join(match.groups()), "%Y%m%d %H%M%S")
        except ValueError:
            # looks like a timestamp but is not a real date (e.g. month 19)
            return None

    def _datetime_from_stat(self, path, stat):
        """
        Returns the file's mtime as a datetime, falling back to ctime if mtime is unusable.

        @while reading a file's modification time
        """

        try:
            return datetime.datetime.fromtimestamp(stat.st_mtime)
        except (ValueError, OverflowError, OSError):
            # fall back to ctime in case mtime is FUBAR
            self.log("%s: mtime (~%i) is out of range, falling back to ctime (~%i)" %(
                path,
                1970 + stat.st_mtime // (365.24*86400),
                1970 + stat.st_ctime // (365.24*86400)
            ), self.log.tl.warn)
            return datetime.datetime.fromtimestamp(stat.st_ctime)

    def add(self, path, preload=False, dt_from_name=False, verbose=False):
        """
        Adds a unique file to the storage. Returns True iff the file was new, False otherwise.

        @while adding a file to the hashed store
        """

        # preload:      `path` already lives in the store; only index it (no copy,
        #               no counters) and take its hash from its file name.
        # dt_from_name: prefer a YYYYmmdd_HHMMSS stamp in the name over the mtime.
        # verbose:      log what happened to this file.
        # Raises RuntimeError if the resulting digest is not 32 bytes long.

        stat = os.stat(path)
        size = stat.st_size
        filename_src = os.path.basename(path)
        filename_src_name, filename_src_suffix = os.path.splitext(filename_src)

        if "." in filename_src_suffix:
            filename_src_suffix = filename_src_suffix[1:]

        if preload:
            hash_hex = filename_src_name.split()[-1]
            hash_raw = bytes(over.misc.hex_to_raw(hash_hex))
        else:
            hasher = self._hash_file(path)
            hash_raw = hasher.digest()
            hash_hex = hasher.hexdigest()

        if len(hash_raw) != 32:
            raise RuntimeError("%s: sha256 is not 256 bits long. This is bad." %(path))

        # Files of a different size cannot be equal, so only same-size hashes are compared.
        suspects = self.store.get(size, {})

        if hash_raw in suspects:
            if verbose:
                self.log("%s (%s) is already known as %s" %(path, hash_hex, suspects[hash_raw]), self.log.tl.warn)

            if not preload:
                self.count_skipped += 1

            return False

        if preload:
            # already named by an earlier import
            self.store.setdefault(size, {})[hash_raw] = filename_src
            return True

        mtime = None

        if dt_from_name:
            mtime = self._datetime_from_name(filename_src_name)

        if mtime is None:
            mtime = self._datetime_from_stat(path, stat)

        filename_str = "%s %s %s.%s" %(
            mtime.strftime("%Y-%m-%d %H:%M:%S"),
            filename_src_name,
            hash_hex,
            filename_src_suffix
        )

        destination = os.path.join(self.path, filename_str)
        shutil.copy(path, destination)
        shutil.copystat(path, destination)

        # Registered only after the copy succeeded, so a failed copy leaves no
        # phantom entry in the index.
        self.store.setdefault(size, {})[hash_raw] = filename_str

        self.count_new += 1
        self.count_total += 1

        if verbose:
            self.log("%s (%s) added" %(path, hash_hex), self.log.tl.done)

        return True

# --------------------------------------------------

if __name__ == "__main__":
    main = over.app.Main("hast", version.str, "AO-JSL")
    main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s")
    main.add_option("datetime-from-name", "Extract file datetime from its name if it matches IMG_YYYYmmdd_HHMMSS.", bool, [True], abbr="d")
    main.add_option("stats", "Print stats before exiting.", bool, [True])
    main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v")
    main.add_doc("Description", ["Hashed storage - imports files from directory targets<.> into the --store<.> flat directory, making sure these are unique. The files are all renamed to mtime original-filename sha256.suffix<.>."])
    main.setup()

    if not main.cfg.store:
        main.print("no --store<.> specified", main.print.tl.fail)
        main.exit(1)

    store_dir = os.path.realpath(main.cfg.store)

    if not os.path.exists(store_dir):
        main.print("creating %s<.>" %(store_dir), main.print.tl.exec)
        os.mkdir(store_dir)

    # also catches a pre-existing path that is a regular file
    if not os.path.isdir(store_dir):
        main.print("%s<.> is not a writable store directory" %(store_dir), main.print.tl.fail)
        main.exit(1)

    store = UniqueStore(store_dir)

    for src in main.targets:
        if main.cfg.verbose:
            main.print("processing %s" %(src))

        walk(src, store.add, verbose=main.cfg.verbose, dt_from_name=main.cfg.datetime_from_name)

    if main.cfg.stats:
        store.log_stats()