From 7e93ffe71f89e20836b2eab1d33de8884cad1788 Mon Sep 17 00:00:00 2001
From: Martinez
Date: Sun, 12 Nov 2017 22:12:09 +0100
Subject: [PATCH] import

---
 .gitignore |   2 +
 dedup.py   | 172 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 install.sh |  11 ++++
 version.py |   8 +++
 4 files changed, 193 insertions(+)
 create mode 100644 .gitignore
 create mode 100755 dedup.py
 create mode 100755 install.sh
 create mode 100644 version.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..653e6f1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+over
+__pycache__
diff --git a/dedup.py b/dedup.py
new file mode 100755
index 0000000..dc2e371
--- /dev/null
+++ b/dedup.py
@@ -0,0 +1,172 @@
+#! /usr/bin/env python3
+# encoding: utf-8
+
+# library imports
+import datetime
+import glob
+import hashlib
+import shutil
+import os
+import over
+
+# local imports
+import version
+
+# --------------------------------------------------
+# Exceptions
+
+class ConfigurationError(Exception):
+    """Raised when the store cannot work with the requested settings."""
+
+# --------------------------------------------------
+# Functions
+
+# --------------------------------------------------
+# Classes
+
+class UniqueStore:
+    """
+    A flat directory of unique files, indexed in memory by size and content hash.
+    """
+
+    # read size used while hashing, so that large files are never held in memory whole
+    CHUNK_SIZE = 1024 * 1024
+
+    def __init__(self, path, algo):
+        """
+        Indexes all regular files already present in the store directory.
+
+        path: the store directory, it has to exist
+        algo: name of a hashing algorithm known to hashlib, e.g. sha512
+
+        Raises ConfigurationError if hashlib cannot produce digests with algo.
+        """
+
+        try:
+            # digest() is called as well, it rejects variable-length algorithms (shake_*)
+            hashlib.new(algo).digest()
+        except (TypeError, ValueError) as exc:
+            raise ConfigurationError("unsupported hashing algorithm: %s" %(algo)) from exc
+
+        self.path = path
+        self.hasher = lambda data=b"": hashlib.new(algo, data)
+        self.log = over.text.Output("store")
+
+        self.store = {} # size: {hash: filename in store}
+
+        self.log("loading storage", self.log.tl.start)
+
+        i = 0
+        for file in os.listdir(path):
+            stored_path = os.path.join(path, file)
+
+            # the store is flat, anything that is not a regular file is not a block
+            if not os.path.isfile(stored_path):
+                continue
+
+            # index the block under the name it really has in the store
+            size = os.stat(stored_path).st_size
+            hash_raw, _ = self._digest(stored_path)
+            self.store.setdefault(size, {})[hash_raw] = file
+            i += 1
+
+        self.log("loaded %d blocks" %(i), self.log.tl.done)
+
+    def _digest(self, path):
+        """
+        Hashes a file chunk by chunk. Returns (raw digest, first 16 hex digits of the digest).
+        """
+
+        hasher = self.hasher()
+
+        with open(path, "rb") as f:
+            for chunk in iter(lambda: f.read(self.CHUNK_SIZE), b""):
+                hasher.update(chunk)
+
+        return hasher.digest(), hasher.hexdigest()[:16]
+
+    def add(self, path, write_enabled=True, verbose=False):
+        """
+        Adds a unique file to the storage. Returns True iff the file was new, False otherwise.
+
+        path: the file to import
+        write_enabled: copy a new file into the store directory, otherwise only index it
+        verbose: log what happened to the file
+
+        @while adding a file to the hashed store
+        """
+
+        stat = os.stat(path)
+        size = stat.st_size
+        hash_raw, hash_hr = self._digest(path)
+
+        suspects = self.store.setdefault(size, {})
+
+        if hash_raw in suspects:
+            if verbose: self.log("%s (%s) is already known as %s" %(path, hash_hr, suspects[hash_raw]))
+            return False
+
+        filename_src = os.path.basename(path)
+        filename_src_name, filename_src_suffix = os.path.splitext(filename_src)
+        if "." in filename_src_suffix: filename_src_suffix = filename_src_suffix[1:]
+        mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
+        filename_str = "%s %s %s.%s" %(
+            filename_src_name,
+            mtime.strftime("%Y-%m-%d %H:%M:%S"),
+            hash_hr,
+            filename_src_suffix
+        )
+
+        # copy before indexing, a failed copy must not leave an entry behind
+        if write_enabled:
+            shutil.copy(path, os.path.join(self.path, filename_str))
+
+        suspects[hash_raw] = filename_str
+
+        if verbose:
+            self.log("%s (%s) added" %(path, hash_hr), self.log.tl.done)
+
+        return True
+
+# --------------------------------------------------
+
+if __name__ == "__main__":
+    main = over.app.Main("dedup", version.str, "AO-JSL")
+    main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s")
+    main.add_option("algo", "Hashing operation to use, use anything supported by hashlib. Either sha256<.> or sha512<.> is recommended, though. The latter is significantly faster on 64bit CPUs.", str, ["sha512"], count=1, abbr="a")
+    main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v")
+    main.add_doc("Description", ["Imports files from directory targets<.> into the --store<.> flat directory, making sure these are unique. The files are all renamed to original-filename mtime hash.suffix<.>."])
+    main.setup()
+
+    if not main.cfg.store:
+        main.print("no --store<.> specified", main.print.tl.fail)
+        main.exit(1)
+
+    store_dir = os.path.realpath(main.cfg.store)
+
+    if not os.path.exists(store_dir):
+        main.print("creating %s<.>" %(store_dir), main.print.tl.exec)
+        # unlike mkdir, makedirs creates missing parent directories as well
+        os.makedirs(store_dir)
+
+    if not os.path.isdir(store_dir):
+        main.print("%s<.> is not a writable store directory" %(store_dir), main.print.tl.fail)
+        main.exit(1)
+
+    try:
+        store = UniqueStore(store_dir, main.cfg.algo)
+    except ConfigurationError as exc:
+        main.print(str(exc), main.print.tl.fail)
+        main.exit(1)
+
+    for src in main.targets:
+        if main.cfg.verbose:
+            main.print("processing %s" %(src))
+
+        # "**" descends into subdirectories only if recursive=True; the directory
+        # name is escaped so that glob characters in it are taken literally
+        pattern = os.path.join(glob.escape(os.path.realpath(src)), "**")
+        files = [f for f in glob.glob(pattern, recursive=True) if os.path.isfile(f)]
+
+        for file in files:
+            store.add(file, verbose=main.cfg.verbose)
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..7a37abe
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,11 @@
+#! /bin/bash
+
+ROOT="${1}"
+LIBDIR="${ROOT}/usr/lib/over/dedup"
+BINDIR="${ROOT}/usr/bin"
+BIN="dedup"
+
+mkdir -p "${LIBDIR}"
+cp *.py "${LIBDIR}"
+mkdir -p "${BINDIR}"
+ln -s "${LIBDIR}/${BIN}.py" "${BINDIR}/${BIN}"
diff --git a/version.py b/version.py
new file mode 100644
index 0000000..5ed0a41
--- /dev/null
+++ b/version.py
@@ -0,0 +1,8 @@
+#! /usr/bin/env python3
+# encoding: utf-8
+
+major = 0 # VERSION_MAJOR_IDENTIFIER
+minor = 0 # VERSION_MINOR_IDENTIFIER
+# VERSION_LAST_MM 0.0
+patch = 0 # VERSION_PATCH_IDENTIFIER
+str = ".".join(str(v) for v in (major, minor, patch))