Skip to content

Allow bagging to destination #138

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
139 changes: 100 additions & 39 deletions bagit.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import os
import re
import signal
import shutil
import sys
import tempfile
import unicodedata
Expand Down Expand Up @@ -142,7 +143,7 @@ def find_locale_dir():


def make_bag(
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", dest_dir=None
):
"""
Convert a given directory into a bag. You can pass in arbitrary
Expand All @@ -162,30 +163,41 @@ def make_bag(
if checksums is None:
checksums = DEFAULT_CHECKSUMS

bag_dir = os.path.abspath(bag_dir)
if dest_dir:
bag_name = os.path.basename(bag_dir)
dest_dir = os.path.abspath(os.path.join(dest_dir, bag_name))
if not os.path.isdir(dest_dir):
os.makedirs(dest_dir)
else:
raise RuntimeError(_("The following directory already exists:\n%s"), dest_dir)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider allowing an empty directory as a target, useful e.g. when creating that directory requires different permissions than the user running bagit has.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. Would checking for an empty directory like this be enough?

Suggested change
raise RuntimeError(_("The following directory already exists:\n%s"), dest_dir)
elif len(os.listdir(dest_dir)) > 0:
raise RuntimeError(_("The following directory already exists and contains files:\n%s"), dest_dir)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That will work because os.listdir is documented as not including . or .. but I was wondering about performance if someone points it at a directory which has a large number of files — the classic Unix answer being os.stat(dest_dir).st_nlink > 2 — but I don't think that'll really be an issue in this scenario.

else:
dest_dir = os.path.abspath(bag_dir)

source_dir = os.path.abspath(bag_dir)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using realpath instead to resolve symlinks (not related to this change, just a general observation)

Copy link
Contributor Author

@nkrabben nkrabben Jul 19, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I use it on line 233 to address an issue with getcwd, but didn't want to break too far from existing practice without more consideration. I don't think it would be an issue. Are there any edge cases that would be good to test for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't think of any edge cases, but I did have to invoke realpath in the setup class for the tests.


cwd = os.path.abspath(os.path.curdir)

if cwd.startswith(bag_dir) and cwd != bag_dir:
if cwd.startswith(source_dir) and cwd != source_dir:
raise RuntimeError(
_("Bagging a parent of the current directory is not supported")
)

LOGGER.info(_("Creating bag for directory %s"), bag_dir)

if not os.path.isdir(bag_dir):
LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)
LOGGER.info(_("Creating bag from directory %s"), source_dir)

if not os.path.isdir(source_dir):
LOGGER.error(_("Bag source directory %s does not exist"), bag_dir)
raise RuntimeError(_("Bag source directory %s does not exist") % bag_dir)

# FIXME: we should do the permissions checks before changing directories
old_dir = os.path.abspath(os.path.curdir)

try:
try:
# TODO: These two checks are currently redundant since an unreadable directory will also
# often be unwritable, and this code will require review when we add the option to
# bag to a destination other than the source. It would be nice if we could avoid
# walking the directory tree more than once even if most filesystems will cache it

unbaggable = _can_bag(bag_dir)
unbaggable = _can_bag(dest_dir)

if unbaggable:
LOGGER.error(
Expand All @@ -194,7 +206,7 @@ def make_bag(
)
raise BagError(_("Missing permissions to move all files and directories"))

unreadable_dirs, unreadable_files = _can_read(bag_dir)
unreadable_dirs, unreadable_files = _can_read(source_dir)

if unreadable_dirs or unreadable_files:
if unreadable_dirs:
Expand All @@ -214,33 +226,57 @@ def make_bag(
LOGGER.info(_("Creating data directory"))

# FIXME: if we calculate full paths we won't need to deal with changing directories
os.chdir(bag_dir)
os.chdir(source_dir)
cwd = os.getcwd()
temp_data = tempfile.mkdtemp(dir=cwd)

for f in os.listdir("."):
if os.path.abspath(f) == temp_data:
continue
new_f = os.path.join(temp_data, f)
LOGGER.info(
_("Moving %(source)s to %(destination)s"),
{"source": f, "destination": new_f},
)
os.rename(f, new_f)
temp_data = tempfile.mkdtemp(dir=dest_dir)
# getcwd resolves symlinks, dest_dir used abspath, which doesn't
temp_data = os.path.realpath(temp_data)

if source_dir == dest_dir:
for f in os.listdir("."):
if os.path.abspath(f) == temp_data:
continue
new_f = os.path.join(temp_data, f)
LOGGER.info(
_("Moving %(source)s to %(destination)s"),
{"source": f, "destination": new_f},
)
os.rename(f, new_f)
else:
for f in os.listdir("."):
new_f = os.path.join(temp_data, f)
LOGGER.info(
_("Copying %(source)s to %(destination)s"),
{"source": f, "destination": new_f},
)
if os.path.isdir(f):
shutil.copytree(f, new_f)
else:
shutil.copy(f, new_f)

LOGGER.info(
_("Moving %(source)s to %(destination)s"),
{"source": temp_data, "destination": "data"},
)
os.rename(temp_data, "data")

os.rename(temp_data, os.path.join(dest_dir, "data"))

# permissions for the payload directory should match those of the
# original directory
os.chmod("data", os.stat(cwd).st_mode)

total_bytes, total_files = make_manifests(
"data", processes, algorithms=checksums, encoding=encoding
)
os.chmod(os.path.join(dest_dir, "data"), os.stat(cwd).st_mode)

if source_dir == dest_dir:
total_bytes, total_files = make_manifests(
"data", processes, algorithms=checksums, encoding=encoding
)
else:
total_bytes, total_files = make_manifests(
".", processes, algorithms=checksums, encoding=encoding, dest_dir=dest_dir, rel_path="data"
)

os.chdir(dest_dir)
cwd = os.getcwd()

LOGGER.info(_("Creating bagit.txt"))
txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
Expand All @@ -264,14 +300,14 @@ def make_bag(
_make_tag_file("bag-info.txt", bag_info)

for c in checksums:
_make_tagmanifest_file(c, bag_dir, encoding="utf-8")
_make_tagmanifest_file(c, dest_dir, encoding="utf-8")
except Exception:
LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir)
raise
finally:
os.chdir(old_dir)

return Bag(bag_dir)
return Bag(dest_dir)


class Bag(object):
Expand Down Expand Up @@ -1237,13 +1273,18 @@ def _make_tag_file(bag_info_path, bag_info):
f.write("%s: %s\n" % (h, txt))


def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"):
def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8", dest_dir=None, rel_path=None):
LOGGER.info(
_("Using %(process_count)d processes to generate manifests: %(algorithms)s"),
{"process_count": processes, "algorithms": ", ".join(algorithms)},
)

manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms)
if not dest_dir:
dest_dir = os.getcwd()

data_dir = os.path.relpath(data_dir)

manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms, rel_path=rel_path)

if processes > 1:
pool = multiprocessing.Pool(processes=processes)
Expand All @@ -1266,8 +1307,9 @@ def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="

for algorithm, values in manifest_data.items():
manifest_filename = "manifest-%s.txt" % algorithm
manifest_path = os.path.join(dest_dir, manifest_filename)

with open_text_file(manifest_filename, "w", encoding=encoding) as manifest:
with open_text_file(manifest_path, "w", encoding=encoding) as manifest:
for digest, filename, byte_count in values:
manifest.write("%s %s\n" % (digest, _encode_filename(filename)))
num_files[algorithm] += 1
Expand Down Expand Up @@ -1386,7 +1428,7 @@ def _can_read(test_dir):
return (tuple(unreadable_dirs), tuple(unreadable_files))


def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS, rel_path=None):
LOGGER.info(_("Generating manifest lines for file %s"), filename)

# For performance we'll read the file only once and pass it block
Expand All @@ -1408,6 +1450,9 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):

decoded_filename = _decode_filename(filename)

if rel_path:
decoded_filename = os.path.join(rel_path, decoded_filename)

# We'll generate a list of results in roughly manifest format but prefixed with the algorithm:
results = [
(alg, hasher.hexdigest(), decoded_filename, total_bytes)
Expand Down Expand Up @@ -1513,6 +1558,15 @@ def _make_parser():
)
% ", ".join(DEFAULT_CHECKSUMS),
)
parser.add_argument(
"--destination",
type=str,
dest="dest_dir",
default=None,
help=_(
"Create bag in destination directory rather than in place."
),
)

for i in CHECKSUM_ALGOS:
alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
Expand Down Expand Up @@ -1601,13 +1655,20 @@ def main():
bag_info=args.bag_info,
processes=args.processes,
checksums=args.checksums,
dest_dir=args.destination
)
except Exception as exc:
LOGGER.error(
_("Failed to create bag in %(bag_directory)s: %(error)s"),
{"bag_directory": bag_dir, "error": exc},
exc_info=True,
)
if args.dest_dir:
LOGGER.error(_("Failed to create bag in %(bag_directory)s: %(error)s"),
{'bag_directory': args.dest_dir, 'error': exc},
exc_info=True,
)
else:
LOGGER.error(
_("Failed to create bag in %(bag_directory)s: %(error)s"),
{"bag_directory": bag_dir, "error": exc},
exc_info=True,
)
rc = 1

sys.exit(rc)
Expand Down
45 changes: 44 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import codecs
import datetime
import filecmp
import hashlib
import logging
import os
Expand Down Expand Up @@ -96,6 +97,48 @@ def test_make_bag_md5_sha1_sha256_manifest(self):
# check valid with three manifests
self.assertTrue(self.validate(bag, fast=True))

def test_make_bag_with_destination(self):
    """Bag ``self.tmpdir`` into a separate destination and verify the result.

    Checks that both requested manifests are written inside the new bag,
    that the bag validates, and that the top level of the copied payload
    matches the top level of the source directory.
    """
    tmp_dir_out = tempfile.mkdtemp(prefix='bagit-test-')
    # try/finally so the scratch directory is removed even when an
    # assertion (or make_bag itself) fails part-way through the test.
    try:
        dest_dir = j(tmp_dir_out, 'test-dest')
        bag = bagit.make_bag(
            self.tmpdir, dest_dir=dest_dir, checksums=['sha256', 'sha512']
        )
        # The bag is expected under dest_dir in a directory named after
        # the source directory's basename.
        subdir = os.path.basename(self.tmpdir)
        for algorithm in ('sha256', 'sha512'):
            self.assertTrue(
                os.path.isfile(j(dest_dir, subdir, 'manifest-%s.txt' % algorithm))
            )
        self.assertTrue(self.validate(bag, fast=True))
        diff = filecmp.dircmp(self.tmpdir, os.path.join(dest_dir, subdir, 'data'))
        # assertEqual reports the offending names when the trees differ.
        self.assertEqual(diff.left_only + diff.right_only, [])
    finally:
        shutil.rmtree(tmp_dir_out)

def test_make_bags_with_destinations(self):
    """Bag ten source directories into one shared destination.

    Each source is a copy of ``test-data``. The destination must end up
    with exactly one sub-directory per source, each containing a ``data``
    payload whose top level matches its source, and every bag created
    must validate.
    """
    dest = tempfile.mkdtemp(prefix='bagit-test-dest-')
    src_par = tempfile.mkdtemp(prefix='bagit-test-src-')
    # try/finally so both scratch directories are removed even when an
    # assertion (or make_bag itself) fails part-way through the test.
    try:
        srcs = tuple(os.path.join(src_par, '%04d' % i) for i in range(10))
        subdirs = tuple(os.path.relpath(src, src_par) for src in srcs)
        # Keep every bag so each one can be validated, not just the last.
        bags = []
        for src in srcs:
            shutil.copytree('test-data', src)
            bags.append(bagit.make_bag(src, dest_dir=dest, checksums=['sha256']))
        self.assertEqual(tuple(sorted(os.listdir(dest))), subdirs)
        for src, subdir, bag in zip(srcs, subdirs, bags):
            diff = filecmp.dircmp(src, os.path.join(dest, subdir, 'data'))
            self.assertEqual(diff.left_only + diff.right_only, [])
            self.assertTrue(self.validate(bag))
    finally:
        shutil.rmtree(src_par)
        shutil.rmtree(dest)

def test_make_bag_bad_destination(self):
    """make_bag must raise RuntimeError when the target bag directory exists.

    The destination is pre-populated with a directory named after the
    source's basename, which is the path make_bag would try to create.
    """
    tmp_dir_out = tempfile.mkdtemp(prefix='bagit-test-dest')
    # try/finally so the scratch directory is removed even if the
    # expected exception is not raised and the assertion fails.
    try:
        subdir = os.path.basename(self.tmpdir)
        os.makedirs(os.path.join(tmp_dir_out, subdir))

        self.assertRaises(
            RuntimeError, bagit.make_bag,
            self.tmpdir, dest_dir=tmp_dir_out, checksums=['sha256', 'sha512']
        )
    finally:
        shutil.rmtree(tmp_dir_out)

def test_validate_flipped_bit(self):
bag = bagit.make_bag(self.tmpdir)
readme = j(self.tmpdir, "data", "README")
Expand Down Expand Up @@ -622,7 +665,7 @@ def test_make_bag_with_bogus_directory(self):
bagit.make_bag(bogus_directory)

self.assertEqual(
"Bag directory %s does not exist" % bogus_directory,
"Bag source directory %s does not exist" % bogus_directory,
str(error_catcher.exception),
)

Expand Down