#! /usr/bin/env python3
# encoding: utf-8

"""Import files from source directories into a flat store of unique files.

Files are compared by size and content hash. Every file that is not yet
known is copied into the store under a name built from its original name,
its modification time and a shortened hex digest.
"""

# library imports
import datetime
import glob
import hashlib
import shutil
import os
import over

# local imports
import version

# --------------------------------------------------
# Exceptions

class ConfigurationError(Exception):
    """Error type for configuration problems; nothing in this file raises it."""
    pass

# --------------------------------------------------
# Functions

# --------------------------------------------------
# Classes

class UniqueStore:
    """
    In-memory index over a flat directory of unique files.

    The index maps file size to a dict of {raw digest: file name in store},
    so only files of equal size are ever compared by hash.
    """

    # Number of bytes read per step while hashing, so that large files
    # are never loaded into memory in one piece.
    _CHUNK_SIZE = 1024 * 1024

    def __init__(self, path, algo):
        """
        Index every regular file found directly inside the store directory.

        path: directory holding the stored files (must exist).
        algo: name of a hashlib constructor, e.g. "sha512".
        """
        self.path = path
        self.hasher = getattr(hashlib, algo)
        self.log = over.text.Output("store")
        self.store = {} # size: {hash: path in store}

        self.log("loading storage", self.log.tl.start)

        i = 0
        for file in os.listdir(path):
            full_path = os.path.join(path, file)

            # Sub-directories and other non-files cannot be hashed; opening
            # them would abort loading of the whole store.
            if not os.path.isfile(full_path):
                continue

            self.add(full_path, write_enabled=False)
            i += 1

        self.log("loaded %d blocks" %(i), self.log.tl.done)

    def _hash_file(self, path):
        """Return (raw digest, first 16 hex digits) of the file's contents."""
        hasher = self.hasher()

        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b""):
                hasher.update(chunk)

        return hasher.digest(), hasher.hexdigest()[:16]

    def _is_in_store(self, path):
        """Return True iff path lies directly inside the store directory."""
        return os.path.abspath(os.path.dirname(path)) == os.path.abspath(self.path)

    def _store_name(self, path, mtime, hash_hr):
        """Build the in-store name: "<name> <mtime> <hash>[.<suffix>]"."""
        name, suffix = os.path.splitext(os.path.basename(path))
        timestamp = datetime.datetime.fromtimestamp(mtime).strftime("%Y-%m-%d %H:%M:%S")

        # splitext() yields either "" or a suffix with its leading dot, so
        # appending it verbatim avoids a dangling "." on extension-less files.
        return "%s %s %s%s" %(name, timestamp, hash_hr, suffix)

    def add(self, path, write_enabled=True, verbose=False):
        """
        Adds a unique file to the storage. Returns True iff the file was new, False otherwise.

        A file counts as known when a file of the same size and the same
        digest is already indexed. A new file is indexed and, if
        write_enabled is set, copied into the store directory. A file that
        already lives in the store directory is indexed under its existing
        name and never copied. With verbose set, the outcome is logged.

        @while adding a file to the hashed store
        """
        stat = os.stat(path)
        size = stat.st_size
        hash_raw, hash_hr = self._hash_file(path)

        suspects = self.store.get(size, {})

        if hash_raw in suspects:
            if verbose:
                self.log("%s (%s) is already known as %s" %(path, hash_hr, suspects[hash_raw]))

            return False

        already_stored = self._is_in_store(path)

        if already_stored:
            # Keep the real on-disk name so the index points at an existing file.
            filename_str = os.path.basename(path)
        else:
            filename_str = self._store_name(path, stat.st_mtime, hash_hr)

        self.store.setdefault(size, {})[hash_raw] = filename_str

        if write_enabled and not already_stored:
            shutil.copy(path, os.path.join(self.path, filename_str))

        if verbose:
            self.log("%s (%s) added" %(path, hash_hr), self.log.tl.done)

        return True

# --------------------------------------------------

if __name__ == "__main__":
    main = over.app.Main("dedup", version.str, "AO-JSL")
    main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s")
    main.add_option("algo", "Hashing operation to use, use anything supported by hashlib. Either sha256<.> or sha512<.> is recommended, though. The latter is significantly faster on 64bit CPUs.", str, ["sha512"], count=1, abbr="a")
    main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v")
    main.add_doc("Description", ["Imports files from directory targets<.> into the --store<.> flat directory, making sure these are unique. The files are all renamed to original-filename mtime hash.suffix<.>."])
    main.setup()

    if not main.cfg.store:
        main.print("no --store<.> specified", main.print.tl.fail)
        main.exit(1)

    store_dir = os.path.realpath(main.cfg.store)

    if not os.path.exists(store_dir):
        main.print("creating %s<.>" %(store_dir), main.print.tl.exec)
        os.mkdir(store_dir)

    if not os.path.isdir(store_dir):
        main.print("%s<.> is not a writable store directory" %(store_dir), main.print.tl.fail)
        main.exit(1)

    store = UniqueStore(store_dir, main.cfg.algo)

    for src in main.targets:
        if main.cfg.verbose:
            main.print("processing %s" %(src))

        # Escape the source path so glob metacharacters in directory names
        # are taken literally; "**" only descends into sub-directories
        # when recursive=True is passed.
        pattern = os.path.join(glob.escape(os.path.realpath(src)), "**")
        files = [f for f in glob.glob(pattern, recursive=True) if os.path.isfile(f)]

        for file in files:
            store.add(file, verbose=main.cfg.verbose)