From 52b8cda64796f6e5c168b9c738db904c9a9ecb19 Mon Sep 17 00:00:00 2001 From: Martinez Date: Mon, 1 Jan 2018 18:07:56 +0100 Subject: [PATCH] renamed to HaSt --- dedup.py => hast.py | 87 ++++++++++++++++++++++++++++++++++----------- install.sh | 5 +-- version.py | 2 +- 3 files changed, 71 insertions(+), 23 deletions(-) rename dedup.py => hast.py (54%) diff --git a/dedup.py b/hast.py similarity index 54% rename from dedup.py rename to hast.py index dc2e371..c89247a 100755 --- a/dedup.py +++ b/hast.py @@ -6,6 +6,7 @@ import datetime import glob import hashlib import shutil +import re import os import over @@ -21,6 +22,23 @@ class ConfigurationError(Exception): # -------------------------------------------------- # Functions +def walk(entry, callback, *args, **kwargs): + """ + For each file contained under the `entry` point, + calls callback(file, *args, **kwargs). + + @while processing a directory tree + """ + + entry = os.path.realpath(entry) + + if os.path.isdir(entry): + for name in os.listdir(entry): + path = os.path.join(entry, name) + walk(path, callback, *args, **kwargs) + else: + callback(entry, *args, **kwargs) + # -------------------------------------------------- # Classes @@ -30,19 +48,24 @@ class UniqueStore: self.hasher = getattr(hashlib, algo) self.log = over.text.Output("store") + self.count_new = 0 + self.count_skipped = 0 + self.count_total = 0 self.store = {} # size: {hash: path in store} self.log("loading storage", self.log.tl.start) - i = 0 for file in os.listdir(path): - self.add(os.path.join(path, file), write_enabled=False) - i += 1 + self.add(os.path.join(path, file), preload=True) + self.count_total += 1 - self.log("loaded %d blocks" %(i), self.log.tl.done) + self.log("loaded %d blocks" %(self.count_total), self.log.tl.done) - def add(self, path, write_enabled=True, verbose=False): + def log_stats(self): + self.log("added %d, skipped %d, holding %d objects" %(self.count_new, self.count_skipped, self.count_total)) + + def add(self, path, preload=False, dt_from_name=False, verbose=False): """ Adds a unique file to the storage. Returns True iff the file was new, False otherwise. @@ -72,35 +95,59 @@ class UniqueStore: filename_src = os.path.basename(path) filename_src_name, filename_src_suffix = os.path.splitext(filename_src) if "." in filename_src_suffix: filename_src_suffix = filename_src_suffix[1:] - mtime = datetime.datetime.fromtimestamp(stat.st_mtime) - filename_str = "%s %s %s.%s" %( - filename_src_name, - mtime.strftime("%Y-%m-%d %H:%M:%S"), - hash_hr, - filename_src_suffix - ) + + mtime = None + + if dt_from_name: + try: + dt_raw = re.findall("([12][0-9]{3}[01][0-9][0-3][0-9])[_\-:,. ]([012][0-9][0-5][0-9][0-5][0-9])", filename_src_name)[0] + dt_raw = " ".join(dt_raw) + mtime = datetime.datetime.strptime(dt_raw, "%Y%m%d %H%M%S") + except: + pass + + if not mtime: + mtime = datetime.datetime.fromtimestamp(stat.st_mtime) + + if preload: + filename_str = filename_src + else: + filename_str = "%s %s %s.%s" %( + filename_src_name, + mtime.strftime("%Y-%m-%d %H:%M:%S"), + hash_hr, + filename_src_suffix + ) if size not in self.store: self.store[size] = {} self.store[size][hash_raw] = filename_str - if write_enabled: + if not preload: shutil.copy(path, os.path.join(self.path, filename_str)) + shutil.copystat(path, os.path.join(self.path, filename_str)) + self.count_new += 1 + self.count_total += 1 if verbose: self.log("%s (%s) added" %(path, hash_hr), self.log.tl.done) + elif not preload: + self.count_skipped += 1 + return is_new # -------------------------------------------------- if __name__ == "__main__": - main = over.app.Main("dedup", version.str, "AO-JSL") - main.add_option("store", "Directory to put unqie files into.", str, count=1, abbr="s") + main = over.app.Main("hast", version.str, "AO-JSL") + main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s") main.add_option("algo", "Hashing operation to use, use anything supported by hashlib. Either sha256<.> or sha512<.> is recommended, though. The latter is significantly faster on 64bit CPUs.", str, ["sha512"], count=1, abbr="a") + main.add_option("datetime-from-name", "Extract file datetime from its name if it matches IMG_YYYYmmdd_HHMMSS.", bool, [True], abbr="d") + main.add_option("stats", "Print stats before exiting.", bool, [True]) main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v") - main.add_doc("Description", ["Imports files from directory targets<.> into the --output<.> flat directory, making sure these are unique. The files are all renamed to original-filename mtime hash.suffix<.>."]) + main.add_doc("Description", ["Hashed storage - imports files from directory targets<.> into the --output<.> flat directory, making sure these are unique. The files are all renamed to original-filename mtime hash.suffix<.>."]) main.setup() if not main.cfg.store: @@ -123,7 +170,7 @@ if __name__ == "__main__": if main.cfg.verbose: main.print("processing %s" %(src)) - files = [f for f in glob.glob(os.path.join(os.path.realpath(src), "**")) if os.path.isfile(f)] - - for file in files: - store.add(file, verbose=main.cfg.verbose) + walk(src, store.add, verbose=main.cfg.verbose, dt_from_name=main.cfg.datetime_from_name) + + if main.cfg.stats: + store.log_stats() diff --git a/install.sh b/install.sh index 7a37abe..d0d0615 100755 --- a/install.sh +++ b/install.sh @@ -1,9 +1,10 @@ #! /bin/bash +NAME="hast" ROOT="${1}" -LIBDIR="${ROOT}/usr/lib/over/dedup" +LIBDIR="${ROOT}/usr/lib/over/${NAME}" BINDIR="${ROOT}/usr/bin" -BIN="dedup" +BIN="${NAME}" mkdir -p "${LIBDIR}" cp *.py "${LIBDIR}" diff --git a/version.py b/version.py index 5ed0a41..be49a4c 100644 --- a/version.py +++ b/version.py @@ -2,7 +2,7 @@ # encoding: utf-8 major = 0 # VERSION_MAJOR_IDENTIFIER -minor = 0 # VERSION_MINOR_IDENTIFIER +minor = 1 # VERSION_MINOR_IDENTIFIER # VERSION_LAST_MM 0.0 patch = 0 # VERSION_PATCH_IDENTIFIER str = ".".join(str(v) for v in (major, minor, patch))