#! /usr/bin/env python3
# encoding: utf-8

# library imports
import datetime
import hashlib
import os
import re
import shutil

import over

# local imports
import version

# --------------------------------------------------
# Exceptions

class ConfigurationError(Exception):
	pass

# --------------------------------------------------
# Functions

def walk(entry, callback, *args, **kwargs):
	"""
	For each file contained under the `entry` point,
	calls callback(file, *args, **kwargs).

	@while processing a directory tree
	"""

	entry = os.path.realpath(entry)

	if os.path.isdir(entry):
		for name in os.listdir(entry):
			path = os.path.join(entry, name)
			walk(path, callback, *args, **kwargs)
	else:
		callback(entry, *args, **kwargs)

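# Illustrative example: walk("/some/dir", print) would print the real path of
# every file found under /some/dir.
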
# --------------------------------------------------
# Classes

class UniqueStore:
	"""
	A flat directory of unique files, deduplicated by size and SHA-256 hash.
	"""

	def __init__(self, path):
		self.path = path
		self.hasher = hashlib.sha256
		self.log = over.text.Output("store")

		self.count_new = 0
		self.count_skipped = 0
		self.count_total = 0

		self.store = {} # size: {hash: filename in store}

		self.log("loading storage", self.log.tl.start)

		# seed the in-memory index with whatever is already in the store
		for file in os.listdir(path):
			self.add(os.path.join(path, file), preload=True)
			self.count_total += 1

		self.log("loaded %d objects" %(self.count_total), self.log.tl.done)

	def log_stats(self):
		self.log("added %d, skipped %d, holding %d objects" %(self.count_new, self.count_skipped, self.count_total))

	def add(self, path, preload=False, dt_from_name=False, verbose=False):
		"""
		Adds a unique file to the storage. Returns True if the file was new,
		False if it was already present.

		@while adding a file to the hashed store
		"""

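		# Stored names follow "mtime original-filename sha256.suffix", e.g. a
		# hypothetical source file IMG_20190804_153012.jpg would be stored as
		# "2019-08-04 15:30:12 IMG_20190804_153012 <sha256 hex>.jpg".
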
		stat = os.stat(path)
		size = stat.st_size
		filename_src = os.path.basename(path)
		filename_src_name, filename_src_suffix = os.path.splitext(filename_src)

		# os.path.splitext keeps the leading dot; drop it
		if filename_src_suffix.startswith("."):
			filename_src_suffix = filename_src_suffix[1:]

		if preload:
			# files already in the store carry their hash as the last
			# space-separated field of the name (before the suffix)
			hash_hex = filename_src_name.split()[-1]
			hash_raw = bytes(over.misc.hex_to_raw(hash_hex))
		else:
			with open(path, "rb") as f:
				hasher = self.hasher(f.read())

			hash_raw = hasher.digest()
			hash_hex = hasher.hexdigest()

		if len(hash_raw) != 32:
			raise RuntimeError("%s: sha256 is not 256 bits long. This is bad." %(path))

		is_new = False

		# duplicates are detected in two steps: files are bucketed by size,
		# then confirmed identical by their SHA-256 hash
		if size in self.store:
			suspects = self.store[size]

			if hash_raw in suspects:
				store_path = suspects[hash_raw]
				if verbose:
					self.log("%s (%s) is already known as %s" %(path, hash_hex, store_path), self.log.tl.warn)
			else:
				is_new = True
		else:
			is_new = True

		if is_new:
			mtime = None

			# try to recover the timestamp from a YYYYmmdd_HHMMSS-style pattern in the name
			if dt_from_name and not preload:
				try:
					dt_raw = re.findall(r"([12][0-9]{3}[01][0-9][0-3][0-9])[_\-:,. ]([012][0-9][0-5][0-9][0-5][0-9])", filename_src_name)[0]
					dt_raw = " ".join(dt_raw)
					mtime = datetime.datetime.strptime(dt_raw, "%Y%m%d %H%M%S")
				except (IndexError, ValueError):
					# no usable timestamp in the name
					pass

			if mtime is None:
				try:
					mtime = datetime.datetime.fromtimestamp(stat.st_mtime)
				except ValueError:
					# fall back to ctime in case mtime is FUBAR
					self.log("%s: mtime (~%i) is out of range, falling back to ctime (~%i)" %(
						path,
						1970 + stat.st_mtime // (365.24*86400),
						1970 + stat.st_ctime // (365.24*86400)
					), self.log.tl.warn)

					mtime = datetime.datetime.fromtimestamp(stat.st_ctime)

			if preload:
				filename_str = filename_src
			else:
				filename_str = "%s %s %s.%s" %(
					mtime.strftime("%Y-%m-%d %H:%M:%S"),
					filename_src_name,
					hash_hex,
					filename_src_suffix
				)

			if size not in self.store:
				self.store[size] = {}

			self.store[size][hash_raw] = filename_str

			if not preload:
				dest = os.path.join(self.path, filename_str)
				shutil.copy(path, dest)
				shutil.copystat(path, dest)
				self.count_new += 1
				self.count_total += 1

				if verbose:
					self.log("%s (%s) added" %(path, hash_hex), self.log.tl.done)

		elif not preload:
			self.count_skipped += 1

		return is_new

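# Programmatic use (a sketch; the command-line entry point below does the same
# thing through the `over` option parser):
#
#     store = UniqueStore("/path/to/store")
#     walk("/path/to/photos", store.add, verbose=True, dt_from_name=True)
#     store.log_stats()
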
# --------------------------------------------------

if __name__ == "__main__":
	main = over.app.Main("hast", version.str, "AO-JSL")
	main.add_option("store", "Directory to put unique files into.", str, count=1, abbr="s")
	main.add_option("datetime-from-name", "Extract the file's datetime from its name if it contains a timestamp of the form YYYYmmdd_HHMMSS (e.g. IMG_YYYYmmdd_HHMMSS.jpg).", bool, [True], abbr="d")
	main.add_option("stats", "Print stats before exiting.", bool, [True])
	main.add_option("verbose", "Describe the action taken with each input file.", bool, [False], abbr="v")
	main.add_doc("Description", ["Hashed storage - imports files from the <m>target<.> directories into the <W>--<g>store<.> flat directory, making sure they are unique. The files are all renamed to <c>mtime original-filename sha256.suffix<.>."])
	main.setup()

	if not main.cfg.store:
		main.print("no <W>--<g>store<.> specified", main.print.tl.fail)
		main.exit(1)

	store_dir = os.path.realpath(main.cfg.store)

	if not os.path.exists(store_dir):
		main.print("creating <y>%s<.>" %(store_dir), main.print.tl.exec)
		os.mkdir(store_dir)

	if not os.path.isdir(store_dir) or not os.access(store_dir, os.W_OK):
		main.print("<r>%s<.> is not a writable store directory" %(store_dir), main.print.tl.fail)
		main.exit(1)

	store = UniqueStore(store_dir)

	for src in main.targets:
		if main.cfg.verbose:
			main.print("processing %s" %(src))

		walk(src, store.add, verbose=main.cfg.verbose, dt_from_name=main.cfg.datetime_from_name)

	if main.cfg.stats:
		store.log_stats()