-
Notifications
You must be signed in to change notification settings - Fork 86
Allow bagging to destination #138
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
b9e2d1f
838e121
eae9a0b
7fd6e42
d3a0824
eb0f093
2b8f3cb
75036b1
8ff5ad5
61769cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
import os | ||
import re | ||
import signal | ||
import shutil | ||
import sys | ||
import tempfile | ||
import unicodedata | ||
|
@@ -142,7 +143,7 @@ def find_locale_dir(): | |
|
||
|
||
def make_bag( | ||
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8" | ||
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", dest_dir=None | ||
): | ||
""" | ||
Convert a given directory into a bag. You can pass in arbitrary | ||
|
@@ -162,30 +163,41 @@ def make_bag( | |
if checksums is None: | ||
checksums = DEFAULT_CHECKSUMS | ||
|
||
bag_dir = os.path.abspath(bag_dir) | ||
if dest_dir: | ||
bag_name = os.path.basename(bag_dir) | ||
dest_dir = os.path.abspath(os.path.join(dest_dir, bag_name)) | ||
if not os.path.isdir(dest_dir): | ||
os.makedirs(dest_dir) | ||
else: | ||
raise RuntimeError(_("The following directory already exists:\n%s"), dest_dir) | ||
else: | ||
dest_dir = os.path.abspath(bag_dir) | ||
|
||
source_dir = os.path.abspath(bag_dir) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Consider using `os.path.realpath`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I use it on line 233 to address an issue with getcwd, but didn't want to break too far from existing practice without more consideration. I don't think it would be an issue. Are there any edge cases that would be good to test for? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I couldn't think of any edge cases, but I did have to invoke realpath in the setup class for the tests. |
||
|
||
cwd = os.path.abspath(os.path.curdir) | ||
|
||
if cwd.startswith(bag_dir) and cwd != bag_dir: | ||
if cwd.startswith(source_dir) and cwd != source_dir: | ||
raise RuntimeError( | ||
_("Bagging a parent of the current directory is not supported") | ||
) | ||
|
||
LOGGER.info(_("Creating bag for directory %s"), bag_dir) | ||
|
||
if not os.path.isdir(bag_dir): | ||
LOGGER.error(_("Bag directory %s does not exist"), bag_dir) | ||
raise RuntimeError(_("Bag directory %s does not exist") % bag_dir) | ||
LOGGER.info(_("Creating bag from directory %s"), source_dir) | ||
|
||
if not os.path.isdir(source_dir): | ||
LOGGER.error(_("Bag source directory %s does not exist"), bag_dir) | ||
raise RuntimeError(_("Bag source directory %s does not exist") % bag_dir) | ||
|
||
# FIXME: we should do the permissions checks before changing directories | ||
old_dir = os.path.abspath(os.path.curdir) | ||
|
||
try: | ||
try: | ||
# TODO: These two checks are currently redundant since an unreadable directory will also | ||
# often be unwritable, and this code will require review when we add the option to | ||
# bag to a destination other than the source. It would be nice if we could avoid | ||
# walking the directory tree more than once even if most filesystems will cache it | ||
|
||
unbaggable = _can_bag(bag_dir) | ||
unbaggable = _can_bag(dest_dir) | ||
|
||
if unbaggable: | ||
LOGGER.error( | ||
|
@@ -194,7 +206,7 @@ def make_bag( | |
) | ||
raise BagError(_("Missing permissions to move all files and directories")) | ||
|
||
unreadable_dirs, unreadable_files = _can_read(bag_dir) | ||
unreadable_dirs, unreadable_files = _can_read(source_dir) | ||
|
||
if unreadable_dirs or unreadable_files: | ||
if unreadable_dirs: | ||
|
@@ -214,33 +226,57 @@ def make_bag( | |
LOGGER.info(_("Creating data directory")) | ||
|
||
# FIXME: if we calculate full paths we won't need to deal with changing directories | ||
os.chdir(bag_dir) | ||
os.chdir(source_dir) | ||
cwd = os.getcwd() | ||
temp_data = tempfile.mkdtemp(dir=cwd) | ||
|
||
for f in os.listdir("."): | ||
if os.path.abspath(f) == temp_data: | ||
continue | ||
new_f = os.path.join(temp_data, f) | ||
LOGGER.info( | ||
_("Moving %(source)s to %(destination)s"), | ||
{"source": f, "destination": new_f}, | ||
) | ||
os.rename(f, new_f) | ||
temp_data = tempfile.mkdtemp(dir=dest_dir) | ||
# getcwd resolves symlinks, dest_dir used abspath, which doesn't | ||
temp_data = os.path.realpath(temp_data) | ||
|
||
if source_dir == dest_dir: | ||
for f in os.listdir("."): | ||
if os.path.abspath(f) == temp_data: | ||
continue | ||
new_f = os.path.join(temp_data, f) | ||
LOGGER.info( | ||
_("Moving %(source)s to %(destination)s"), | ||
{"source": f, "destination": new_f}, | ||
) | ||
os.rename(f, new_f) | ||
else: | ||
for f in os.listdir("."): | ||
new_f = os.path.join(temp_data, f) | ||
LOGGER.info( | ||
_("Copying %(source)s to %(destination)s"), | ||
{"source": f, "destination": new_f}, | ||
) | ||
if os.path.isdir(f): | ||
shutil.copytree(f, new_f) | ||
else: | ||
shutil.copy(f, new_f) | ||
|
||
LOGGER.info( | ||
_("Moving %(source)s to %(destination)s"), | ||
{"source": temp_data, "destination": "data"}, | ||
) | ||
os.rename(temp_data, "data") | ||
|
||
os.rename(temp_data, os.path.join(dest_dir, "data")) | ||
|
||
# permissions for the payload directory should match those of the | ||
# original directory | ||
os.chmod("data", os.stat(cwd).st_mode) | ||
|
||
total_bytes, total_files = make_manifests( | ||
"data", processes, algorithms=checksums, encoding=encoding | ||
) | ||
os.chmod(os.path.join(dest_dir, "data"), os.stat(cwd).st_mode) | ||
|
||
if source_dir == dest_dir: | ||
total_bytes, total_files = make_manifests( | ||
"data", processes, algorithms=checksums, encoding=encoding | ||
) | ||
else: | ||
total_bytes, total_files = make_manifests( | ||
".", processes, algorithms=checksums, encoding=encoding, dest_dir=dest_dir, rel_path="data" | ||
) | ||
|
||
os.chdir(dest_dir) | ||
cwd = os.getcwd() | ||
|
||
LOGGER.info(_("Creating bagit.txt")) | ||
txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n""" | ||
|
@@ -264,14 +300,14 @@ def make_bag( | |
_make_tag_file("bag-info.txt", bag_info) | ||
|
||
for c in checksums: | ||
_make_tagmanifest_file(c, bag_dir, encoding="utf-8") | ||
_make_tagmanifest_file(c, dest_dir, encoding="utf-8") | ||
except Exception: | ||
LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir) | ||
raise | ||
finally: | ||
os.chdir(old_dir) | ||
|
||
return Bag(bag_dir) | ||
return Bag(dest_dir) | ||
|
||
|
||
class Bag(object): | ||
|
@@ -1237,13 +1273,18 @@ def _make_tag_file(bag_info_path, bag_info): | |
f.write("%s: %s\n" % (h, txt)) | ||
|
||
|
||
def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"): | ||
def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8", dest_dir=None, rel_path=None): | ||
LOGGER.info( | ||
_("Using %(process_count)d processes to generate manifests: %(algorithms)s"), | ||
{"process_count": processes, "algorithms": ", ".join(algorithms)}, | ||
) | ||
|
||
manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms) | ||
if not dest_dir: | ||
dest_dir = os.getcwd() | ||
|
||
data_dir = os.path.relpath(data_dir) | ||
|
||
manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms, rel_path=rel_path) | ||
|
||
if processes > 1: | ||
pool = multiprocessing.Pool(processes=processes) | ||
|
@@ -1266,8 +1307,9 @@ def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding=" | |
|
||
for algorithm, values in manifest_data.items(): | ||
manifest_filename = "manifest-%s.txt" % algorithm | ||
manifest_path = os.path.join(dest_dir, manifest_filename) | ||
|
||
with open_text_file(manifest_filename, "w", encoding=encoding) as manifest: | ||
with open_text_file(manifest_path, "w", encoding=encoding) as manifest: | ||
for digest, filename, byte_count in values: | ||
manifest.write("%s %s\n" % (digest, _encode_filename(filename))) | ||
num_files[algorithm] += 1 | ||
|
@@ -1386,7 +1428,7 @@ def _can_read(test_dir): | |
return (tuple(unreadable_dirs), tuple(unreadable_files)) | ||
|
||
|
||
def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS): | ||
def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS, rel_path=None): | ||
LOGGER.info(_("Generating manifest lines for file %s"), filename) | ||
|
||
# For performance we'll read the file only once and pass it block | ||
|
@@ -1408,6 +1450,9 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS): | |
|
||
decoded_filename = _decode_filename(filename) | ||
|
||
if rel_path: | ||
decoded_filename = os.path.join(rel_path, decoded_filename) | ||
|
||
# We'll generate a list of results in roughly manifest format but prefixed with the algorithm: | ||
results = [ | ||
(alg, hasher.hexdigest(), decoded_filename, total_bytes) | ||
|
@@ -1513,6 +1558,15 @@ def _make_parser(): | |
) | ||
% ", ".join(DEFAULT_CHECKSUMS), | ||
) | ||
parser.add_argument( | ||
"--destination", | ||
type=str, | ||
dest="dest_dir", | ||
default=None, | ||
help=_( | ||
"Create bag in destination directory rather than in place." | ||
), | ||
) | ||
|
||
for i in CHECKSUM_ALGOS: | ||
alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper()) | ||
|
@@ -1601,13 +1655,20 @@ def main(): | |
bag_info=args.bag_info, | ||
processes=args.processes, | ||
checksums=args.checksums, | ||
dest_dir=args.destination | ||
) | ||
except Exception as exc: | ||
LOGGER.error( | ||
_("Failed to create bag in %(bag_directory)s: %(error)s"), | ||
{"bag_directory": bag_dir, "error": exc}, | ||
exc_info=True, | ||
) | ||
if args.dest_dir: | ||
LOGGER.error(_("Failed to create bag in %(bag_directory)s: %(error)s"), | ||
{'bag_directory': args.dest_dir, 'error': exc}, | ||
exc_info=True, | ||
) | ||
else: | ||
LOGGER.error( | ||
_("Failed to create bag in %(bag_directory)s: %(error)s"), | ||
{"bag_directory": bag_dir, "error": exc}, | ||
exc_info=True, | ||
) | ||
rc = 1 | ||
|
||
sys.exit(rc) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider allowing an empty directory as a target, useful e.g. when creating that directory requires different permissions than the user running bagit has.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. Would checking for an empty directory like this be enough?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That will work because `os.listdir` is documented as not including `.` or `..`, but I was wondering about performance if someone points it at a directory with a large number of files — the classic Unix answer being `os.stat(dest_dir).st_nlink > 2` — but I don't think that'll really be an issue in this scenario.