diff --git a/hast.py b/hast.py index c89247a..2c4d6ec 100755 --- a/hast.py +++ b/hast.py @@ -43,9 +43,9 @@ def walk(entry, callback, *args, **kwargs): # Classes class UniqueStore: - def __init__(self, path, algo): + def __init__(self, path): self.path = path - self.hasher = getattr(hashlib, algo) + self.hasher = getattr(hashlib, "sha256") self.log = over.text.Output("store") self.count_new = 0 @@ -74,9 +74,20 @@ class UniqueStore: stat = os.stat(path) size = stat.st_size - hasher = self.hasher(open(path, "rb").read()) - hash_raw = hasher.digest() - hash_hr = hasher.hexdigest()[:16] + filename_src = os.path.basename(path) + filename_src_name, filename_src_suffix = os.path.splitext(filename_src) + if "." in filename_src_suffix: filename_src_suffix = filename_src_suffix[1:] + + if preload: + hash_hex = filename_src_name.split()[-1] + hash_raw = bytes(over.misc.hex_to_raw(hash_hex)) + else: + hasher = self.hasher(open(path, "rb").read()) + hash_raw = hasher.digest() + hash_hex = hasher.hexdigest() + + if not len(hash_raw) == 32: + raise RuntimeError("%s: sha256 is not 256 bits long. This is bad." %(path)) is_new = False @@ -85,20 +96,17 @@ class UniqueStore: if hash_raw in suspects: store_path = suspects[hash_raw] - if verbose: self.log("%s (%s) is already known as %s" %(path, hash_hr, store_path)) + if verbose: + self.log("%s (%s) is already known as %s" %(path, hash_hex, store_path), self.log.tl.warn) else: is_new = True else: is_new = True if is_new: - filename_src = os.path.basename(path) - filename_src_name, filename_src_suffix = os.path.splitext(filename_src) - if "." in filename_src_suffix: filename_src_suffix = filename_src_suffix[1:] - mtime = None - if dt_from_name: + if dt_from_name and not preload: try: dt_raw = re.findall("([12][0-9]{3}[01][0-9][0-3][0-9])[_\-:,. ]([012][0-9][0-5][0-9][0-5][0-9])", filename_src_name)[0] dt_raw = " ".join(dt_raw) @@ -113,9 +121,9 @@ class UniqueStore: filename_str = filename_src else: filename_str = "%s %s %s.%s" %( - filename_src_name, mtime.strftime("%Y-%m-%d %H:%M:%S"), - hash_hr, + filename_src_name, + hash_hex, filename_src_suffix ) @@ -131,7 +139,7 @@ class UniqueStore: self.count_total += 1 if verbose: - self.log("%s (%s) added" %(path, hash_hr), self.log.tl.done) + self.log("%s (%s) added" %(path, hash_hex), self.log.tl.done) elif not preload: self.count_skipped += 1 @@ -143,11 +151,10 @@ class UniqueStore: if __name__ == "__main__": main = over.app.Main("hast", version.str, "AO-JSL") main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s") - main.add_option("algo", "Hashing operation to use, use anything supported by hashlib. Either sha256<.> or sha512<.> is recommended, though. The latter is significantly faster on 64bit CPUs.", str, ["sha512"], count=1, abbr="a") main.add_option("datetime-from-name", "Extract file datetime from its name if it matches IMG_YYYYmmdd_HHMMSS.", bool, [True], abbr="d") main.add_option("stats", "Print stats before exiting.", bool, [True]) main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v") - main.add_doc("Description", ["Hashed storage - imports files from directory targets<.> into the --output<.> flat directory, making sure these are unique. The files are all renamed to original-filename mtime hash.suffix<.>."]) + main.add_doc("Description", ["Hashed storage - imports files from directory targets<.> into the --output<.> flat directory, making sure these are unique. The files are all renamed to mtime original-filename sha256.suffix<.>."]) main.setup() if not main.cfg.store: @@ -164,7 +171,7 @@ if __name__ == "__main__": main.print("%s<.> is not a writable store directory" %(store_dir), main.print.tl.fail) main.exit(1) - store = UniqueStore(store_dir, main.cfg.algo) + store = UniqueStore(store_dir) for src in main.targets: if main.cfg.verbose: