From 5bb950380dc804907be32d1aba6a10cc1b08cfe3 Mon Sep 17 00:00:00 2001
From: "Spigel, Radim"
Date: Thu, 24 Jan 2019 23:16:35 +0100
Subject: [PATCH 1/2] feat: bagitfs

---
 bagitfs.py      | 612 ++++++++++++++++++++++++++++++++++++++++++++++++
 test.py         |   2 +-
 test_bagitfs.py |  57 +++++
 3 files changed, 670 insertions(+), 1 deletion(-)
 create mode 100644 bagitfs.py
 create mode 100644 test_bagitfs.py

diff --git a/bagitfs.py b/bagitfs.py
new file mode 100644
index 0000000..2abb2b8
--- /dev/null
+++ b/bagitfs.py
@@ -0,0 +1,612 @@
+import os
+import codecs
+import hashlib
+import multiprocessing
+from bagit import (
+    Bag,
+    BagError,
+    BagValidationError,
+    _parse_tags,
+    CHECKSUM_ALGOS,
+    UNICODE_BYTE_ORDER_MARK,
+    LOGGER,
+    _decode_filename,
+    normalize_unicode,
+    HASH_BLOCK_SIZE,
+    force_unicode,
+    posix_multiprocessing_worker_initializer,
+    ChecksumMismatch,
+    _
+)
+from fs.base import FS
+
+# TODO: move support methods to bagit_utils
+
+
+def walk(top, memory_fs ,topdown=True, onerror=None, followlinks=False):
+    """Directory tree generator (os.walk() adapted to walk a PyFilesystem FS object).
+    For each directory in the directory tree rooted at top (including top
+    itself, but excluding '.' and '..'), yields a 3-tuple
+        dirpath, dirnames, filenames
+    dirpath is a string, the path to the directory.  dirnames is a list of
+    the names of the subdirectories in dirpath (excluding '.' and '..').
+    filenames is a list of the names of the non-directory files in dirpath.
+    Note that the names in the lists are just names, with no path components.
+    To get a full path (which begins with top) to a file or directory in
+    dirpath, do os.path.join(dirpath, name).
+    If optional arg 'topdown' is true or not specified, the triple for a
+    directory is generated before the triples for any of its subdirectories
+    (directories are generated top down).  If topdown is false, the triple
+    for a directory is generated after the triples for all of its
+    subdirectories (directories are generated bottom up).
+    When topdown is true, the caller can modify the dirnames list in-place
+    (e.g., via del or slice assignment), and walk will only recurse into the
+    subdirectories whose names remain in dirnames; this can be used to prune the
+    search, or to impose a specific order of visiting.  Modifying dirnames when
+    topdown is false is ineffective, since the directories in dirnames have
+    already been generated by the time dirnames itself is generated. No matter
+    the value of topdown, the list of subdirectories is retrieved before the
+    tuples for the directory and its subdirectories are generated.
+    By default OSError raised by the memory_fs.scandir() call is ignored.  If
+    optional arg 'onerror' is specified, it should be a function; it
+    will be called with one argument, an OSError instance.  It can
+    report the error to continue with the walk, or raise the exception
+    to abort the walk.  Note that the filename is available as the
+    filename attribute of the exception object.
+    By default, walk does not follow symbolic links to subdirectories on
+    systems that support them.  In order to get this functionality, set the
+    optional argument 'followlinks' to true.
+    Caution:  if you pass a relative pathname for top, don't change the
+    current working directory between resumptions of walk.  walk never
+    changes the current directory, and assumes that the client doesn't
+    either.
+    Example (shown for os.walk, which this generator mirrors):
+        import os
+        from os.path import join, getsize
+        for root, dirs, files in os.walk('python/Lib/email'):
+            print(root, "consumes", end="")
+            print(sum(getsize(join(root, name)) for name in files), end="")
+            print("bytes in", len(files), "non-directory files")
+            if 'CVS' in dirs:
+                dirs.remove('CVS')  # don't visit CVS directories
+    """
+    dirs = []
+    nondirs = []
+    walk_dirs = []
+
+    # We may not have read permission for top, in which case we can't
+    # get a list of the files the directory contains.  os.walk
+    # always suppressed the exception then, rather than blow up for a
+    # minor reason when (say) a thousand readable directories are still
+    # left to visit.  That logic is copied here.
+    try:
+        # scandir() here is the method of the FS object that was passed in,
+        # not os.scandir(); its entries are used via .name and .is_dir below.
+        scandir_it = memory_fs.scandir(top)
+    except OSError as error:
+        if onerror is not None:
+            onerror(error)
+        return
+
+    while True:
+        try:
+            try:
+                entry = next(scandir_it)
+            except StopIteration:
+                break
+        except OSError as error:
+            if onerror is not None:
+                onerror(error)
+            return
+        try:
+            is_dir = entry.is_dir
+        except OSError:
+            # If is_dir raises an OSError, consider that the entry is not
+            # a directory, same behaviour than os.path.isdir().
+            is_dir = False
+
+        if is_dir:
+            dirs.append(entry.name)
+        else:
+            nondirs.append(entry.name)
+
+        if not topdown and is_dir:
+            # Bottom-up: recurse into sub-directory, but exclude symlinks to
+            # directories if followlinks is False
+            if followlinks:
+                walk_into = True
+            else:
+                try:
+                    is_symlink = memory_fs.islink(os.path.join(top, entry.name))
+                except OSError:
+                    # If islink() raises an OSError, consider that the
+                    # entry is not a symbolic link, same behaviour than
+                    # os.path.islink().
+                    is_symlink = False
+                walk_into = not is_symlink
+
+            if walk_into:
+                walk_dirs.append(os.path.join(top, entry.name))
+
+    # Yield before recursion if going top down
+    if topdown:
+        yield top, dirs, nondirs
+
+        # Recurse into sub-directories
+        islink, join = memory_fs.islink, os.path.join
+        for dirname in dirs:
+            new_path = join(top, dirname)
+            # Issue #23605: islink() is called again here instead of caching
+            # a per-entry result during the loop on scandir() because
+            # the caller can replace the directory entry during the "yield"
+            # above.
+            if followlinks or not islink(new_path):
+                yield from walk(new_path, memory_fs, topdown, onerror, followlinks)
+    else:
+        # Recurse into sub-directories
+        for new_path in walk_dirs:
+            yield from walk(new_path, memory_fs, topdown, onerror, followlinks)
+        # Yield after recursion if going bottom up
+        yield top, dirs, nondirs
+
+
+def _calculate_file_hashes(memory_fs, full_path, f_hashers):
+    """
+    Returns a dictionary of (algorithm, hexdigest) values for the provided
+    filename
+    """
+    LOGGER.info(_("Verifying checksum for file %s"), full_path)
+
+    try:
+        with memory_fs.open(full_path, "rb") as f:
+            while True:
+                block = f.read(HASH_BLOCK_SIZE)
+                if not block:
+                    break
+                for i in f_hashers.values():
+                    i.update(block)
+    except (OSError, IOError) as e:
+        raise BagValidationError(
+            _("Could not read %(filename)s: %(error)s")
+            % {"filename": full_path, "error": force_unicode(e)}
+        )
+
+    return dict((alg, h.hexdigest()) for alg, h in f_hashers.items())
+
+def _calc_hashes(args):
+    # auto unpacking of sequences illegal in Python3
+    (memory_fs, base_path, rel_path, hashes, algorithms) = args
+    full_path = os.path.join(base_path, rel_path)
+
+    # Create a clone of the default empty hash objects:
+    f_hashers = dict((alg, hashlib.new(alg)) for alg in hashes if alg in algorithms)
+
+    try:
+        f_hashes = _calculate_file_hashes(memory_fs, full_path, f_hashers)
+    except BagValidationError as e:
+        f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers.keys())
+
+    return rel_path, f_hashes, hashes
+
+
+class BagFsException(Exception):
+    """Raised by BagitFs when it is constructed with something that is not an FS instance"""
+
+# Tag files are opened through the FS object that is passed in, not through codecs.open
+
+def _load_tag_file(tag_file_name, file_descriptor, encoding="utf-8"):
+    with file_descriptor.open(tag_file_name, "r", encoding=encoding) as tag_file:
+        # Store duplicate tags as list of vals
+        # in order of parsing under the same key.
+        tags = {}
+        for name, value in _parse_tags(tag_file):
+            if name not in tags:
+                tags[name] = value
+                continue
+
+            if not isinstance(tags[name], list):
+                tags[name] = [tags[name], value]
+            else:
+                tags[name].append(value)
+
+    return tags
+
+class BagitFs(Bag):
+    """
+    Bag subclass that reads and validates a bag through a PyFilesystem FS object (e.g. ZipFS or
+    MemoryFS, see https://docs.pyfilesystem.org/en/latest/reference/memoryfs.html) instead of os
+    Basic use:
+    with ZipFS('path_to_zip.zip') as zip_fs:
+        bag = BagitFs(zip_fs)
+        print(bag.validate())
+    """
+    def __init__(self, memory_fs, path="/"):
+        # type: (FS, str) -> None
+        # Bag.__init__ is not invoked here; instance state is set up explicitly below
+        self.tags = {}
+        self.info = {}
+        #: Dictionary of manifest entries and the checksum values for each
+        #: algorithm:
+        self.entries = {}
+
+        # To reliably handle Unicode normalization differences, we maintain
+        # lookup dictionaries in both directions for the filenames read from
+        # the filesystem and the manifests so we can handle cases where the
+        # normalization form changed between the bag being created and read.
+        # See https://github.com/LibraryOfCongress/bagit-python/issues/51.
+
+        #: maps Unicode-normalized values to the raw value from the filesystem
+        self.normalized_filesystem_names = {}
+
+        #: maps Unicode-normalized values to the raw value in the manifest
+        self.normalized_manifest_names = {}
+
+        self.algorithms = []
+        self.tag_file_name = None
+        if not isinstance(memory_fs, FS):
+            raise BagFsException("BagitFs requires FS instance.")
+        self.path = path
+        self.memory_fs = memory_fs  # type: FS
+        self._open()
+
+    def __str__(self):
+        return "Bagit in memory {}".format(self.memory_fs)
+
+    def manifest_files(self):
+        for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
+            file_path = os.path.join(self.path, filename)
+            if self.memory_fs.isfile(file_path):
+                yield file_path
+
+    def tagmanifest_files(self):
+        for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
+            file_path = os.path.join(self.path, filename)
+            if self.memory_fs.isfile(file_path):
+                yield file_path
+
+    def _open(self):
+        # Open the bagit.txt file, and load any tags from it, including
+        # the required version and encoding.
+
+        bagit_file_path = os.path.join(self.path, "bagit.txt")
+
+        if not self.memory_fs.isfile(bagit_file_path):
+            raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path)
+
+        self.tags = tags = _load_tag_file(bagit_file_path, self.memory_fs)
+
+        required_tags = ("BagIt-Version", "Tag-File-Character-Encoding")
+        missing_tags = [i for i in required_tags if i not in tags]
+        if missing_tags:
+            raise BagError(
+                _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags)
+            )
+
+        # To avoid breaking existing code we'll leave self.version as the string
+        # and parse it into a numeric version_info tuple. In version 2.0 we can
+        # break that.
+
+        self._version = tags["BagIt-Version"]
+
+        try:
+            self.version_info = tuple(int(i) for i in self._version.split(".", 1))
+        except ValueError:
+            raise BagError(
+                _("Bag version numbers must be MAJOR.MINOR numbers, not %s")
+                % self._version
+            )
+
+        if (0, 93) <= self.version_info <= (0, 95):
+            self.tag_file_name = "package-info.txt"
+        elif (0, 96) <= self.version_info < (2,):
+            self.tag_file_name = "bag-info.txt"
+        else:
+            raise BagError(_("Unsupported bag version: %s") % self._version)
+
+        self.encoding = tags["Tag-File-Character-Encoding"]
+
+        try:
+            codecs.lookup(self.encoding)
+        except LookupError:
+            raise BagValidationError(_("Unsupported encoding: %s") % self.encoding)
+
+        info_file_path = os.path.join(self.path, self.tag_file_name)
+        if self.memory_fs.isfile(info_file_path):
+            self.info = _load_tag_file(info_file_path, self.memory_fs, encoding=self.encoding)
+
+        self._load_manifests()
+
+    def _load_manifests(self):
+        self.entries = {}
+        manifests = list(self.manifest_files())
+
+        if self.version_info >= (0, 97):
+            # v0.97+ requires that optional tagfiles are verified.
+            manifests += list(self.tagmanifest_files())
+
+        for manifest_filename in manifests:
+            if os.path.basename(manifest_filename).startswith("tagmanifest-"):
+                search = "tagmanifest-"
+            else:
+                search = "manifest-"
+            alg = (
+                os.path.basename(manifest_filename)
+                .replace(search, "")
+                .replace(".txt", "")
+            )
+            if alg not in self.algorithms:
+                self.algorithms.append(alg)
+
+            with self.memory_fs.open(manifest_filename, encoding=self.encoding) as manifest_file:
+                if manifest_file.encoding.startswith("UTF"):
+                    # We'll check the first character to see if it's a BOM:
+                    if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
+                        # We'll skip it either way by letting line decoding
+                        # happen at the new offset but we will issue a warning
+                        # for UTF-8 since the presence of a BOM is contrary to
+                        # the BagIt specification:
+                        if manifest_file.encoding == "UTF-8":
+                            LOGGER.warning(
+                                _(
+                                    "%s is encoded using UTF-8 but contains an unnecessary"
+                                    " byte-order mark, which is not in compliance with the"
+                                    " BagIt RFC"
+                                ),
+                                manifest_file.name,
+                            )
+                    else:
+                        manifest_file.seek(0)  # Pretend the first read never happened
+
+                for line in manifest_file:
+                    line = line.strip()
+
+                    # Ignore blank lines and comments.
+                    if line == "" or line.startswith("#"):
+                        continue
+
+                    entry = line.split(None, 1)
+
+                    # Format is CHECKSUM *FILENAME
+                    if len(entry) != 2:
+                        LOGGER.error(
+                            _(
+                                "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"
+                            ),
+                            {"bag": self, "algorithm": alg, "line": line},
+                        )
+                        continue
+
+                    entry_hash = entry[0]
+                    entry_path = os.path.normpath(entry[1].lstrip("*"))
+                    entry_path = _decode_filename(entry_path)
+
+                    if self._path_is_dangerous(entry_path):
+                        raise BagError(
+                            _(
+                                'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe'
+                            )
+                            % {
+                                "payload_file": entry_path,
+                                "manifest_file": manifest_file.name,
+                            }
+                        )
+
+                    entry_hashes = self.entries.setdefault(entry_path, {})
+
+                    if alg in entry_hashes:
+                        warning_ctx = {
+                            "bag": self,
+                            "algorithm": alg,
+                            "filename": entry_path,
+                        }
+                        if entry_hashes[alg] == entry_hash:
+                            msg = _(
+                                "%(bag)s: %(algorithm)s manifest lists %(filename)s"
+                                " multiple times with the same value"
+                            )
+                            if self.version_info >= (1,):
+                                raise BagError(msg % warning_ctx)
+                            else:
+                                LOGGER.warning(msg, warning_ctx)
+                        else:
+                            raise BagError(
+                                _(
+                                    "%(bag)s: %(algorithm)s manifest lists %(filename)s"
+                                    " multiple times with conflicting values"
+                                )
+                                % warning_ctx
+                            )
+
+                    entry_hashes[alg] = entry_hash
+
+        self.normalized_manifest_names.update(
+            (normalize_unicode(i), i) for i in self.entries.keys()
+        )
+
+    def _validate_structure_payload_directory(self):
+        data_dir_path = os.path.join(self.path, "data")
+
+        if not self.memory_fs.isdir(data_dir_path):
+            raise BagValidationError(
+                _("Expected data directory %s does not exist") % data_dir_path
+            )
+
+    def _validate_structure_tag_files(self):
+        # Note: we deviate somewhat from v0.96 of the spec in that it allows
+        # other files and directories to be present in the base directory
+
+        if not list(self.manifest_files()):
+            raise BagValidationError(_("No manifest files found"))
+        if "bagit.txt" not in self.memory_fs.listdir(self.path):
+            raise BagValidationError(
+                _('Expected %s to contain "bagit.txt"') % self.path
+            )
+
+    def _validate_bagittxt(self):
+        """
+        Verify that bagit.txt conforms to specification
+        """
+        bagit_file_path = os.path.join(self.path, "bagit.txt")
+
+        # Note that we are intentionally opening this file in binary mode so we can confirm
+        # that it does not start with the UTF-8 byte-order-mark
+        with self.memory_fs.open(bagit_file_path, "rb") as bagit_file:
+            first_line = bagit_file.read(4)
+            if first_line.startswith(codecs.BOM_UTF8):
+                raise BagValidationError(
+                    _("bagit.txt must not contain a byte-order mark")
+                )
+
+    def fetch_entries(self):
+        """Load fetch.txt if present and iterate over its contents
+
+        yields (url, size, filename) tuples
+
+        raises BagError for errors such as an unsafe filename referencing
+        data outside of the bag directory
+        """
+
+        fetch_file_path = os.path.join(self.path, "fetch.txt")
+
+        if self.memory_fs.isfile(fetch_file_path):
+            with self.memory_fs.open(
+                fetch_file_path, "r", encoding=self.encoding
+            ) as fetch_file:
+                for line in fetch_file:
+                    url, file_size, filename = line.strip().split(None, 2)
+
+                    if self._path_is_dangerous(filename):
+                        raise BagError(
+                            _('Path "%(payload_file)s" in "%(source_file)s" is unsafe')
+                            % {
+                                "payload_file": filename,
+                                "source_file": os.path.join(self.path, "fetch.txt"),
+                            }
+                        )
+
+                    yield url, file_size, filename
+
+    def payload_files(self):
+        """Yields the paths, relative to self.path, of the files found under data/ on self.memory_fs"""
+        payload_dir = os.path.join(self.path, "data")
+        for dirpath, _dirnames, filenames in walk(payload_dir, memory_fs=self.memory_fs):
+            for f in filenames:
+                # Jump through some hoops here to make the payload files are
+                # returned with the directory structure relative to the base
+                # directory (self.path) rather than to the data directory
+                normalized_f = os.path.normpath(f)
+                rel_path = os.path.relpath(
+                    os.path.join(dirpath, normalized_f), start=self.path
+                )
+
+                self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path
+                yield rel_path
+
+    def missing_optional_tagfiles(self):
+        """
+        From v0.97 we need to validate any tagfiles listed
+        in the optional tagmanifest(s). As there is no mandatory
+        directory structure for additional tagfiles we can
+        only check for entries with missing files (not missing
+        entries for existing files).
+        """
+        for tagfilepath in self.tagfile_entries().keys():
+            if not self.memory_fs.isfile(os.path.join(self.path, tagfilepath)):
+                yield tagfilepath
+
+    def _validate_oxum(self):
+        """
+        Verify the Payload-Oxum tag (BYTES.FILES) against the payload stored on
+        self.memory_fs, so that file sizes are not looked up on the host filesystem
+        """
+        oxum = self.info.get("Payload-Oxum")
+
+        if oxum is None:
+            return
+
+        # If multiple Payload-Oxum tags (bad idea)
+        # use the first listed in bag-info.txt
+        if isinstance(oxum, list):
+            LOGGER.warning(_("bag-info.txt defines multiple Payload-Oxum values!"))
+            oxum = oxum[0]
+
+        oxum_byte_count, _sep, oxum_file_count = oxum.partition(".")
+
+        if not oxum_byte_count.isdigit() or not oxum_file_count.isdigit():
+            raise BagError(_("Malformed Payload-Oxum value: %s") % oxum)
+
+        oxum_byte_count = int(oxum_byte_count)
+        oxum_file_count = int(oxum_file_count)
+        total_bytes = 0
+        total_files = 0
+
+        for payload_file in self.payload_files():
+            payload_file = os.path.join(self.path, payload_file)
+            total_bytes += self.memory_fs.getsize(payload_file)
+            total_files += 1
+
+        if oxum_file_count != total_files or oxum_byte_count != total_bytes:
+            raise BagValidationError(
+                _(
+                    "Payload-Oxum validation failed."
+                    " Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes"
+                    " but found %(found_file_count)d files and %(found_byte_count)d bytes"
+                )
+                % {
+                    "found_file_count": total_files,
+                    "found_byte_count": total_bytes,
+                    "oxum_file_count": oxum_file_count,
+                    "oxum_byte_count": oxum_byte_count,
+                }
+            )
+
+    def _validate_entries(self, processes):
+        """
+        Verify that the actual file contents match the recorded hashes stored in the manifest files
+        """
+        errors = list()
+
+        if os.name == "posix":
+            worker_init = posix_multiprocessing_worker_initializer
+        else:
+            worker_init = None
+
+        args = (
+            (
+                self.memory_fs,
+                self.path,
+                self.normalized_filesystem_names.get(rel_path, rel_path),
+                hashes,
+                self.algorithms,
+            )
+            for rel_path, hashes in self.entries.items()
+        )
+        try:
+            if processes == 1:
+                hash_results = [_calc_hashes(i) for i in args]
+            else:
+                pool = multiprocessing.Pool(
+                    processes if processes else None, initializer=worker_init
+                )
+                try:
+                    hash_results = pool.map(_calc_hashes, args)
+                finally:
+                    pool.terminate()
+
+        # Any unhandled exceptions are probably fatal
+        except:
+            LOGGER.exception(_("Unable to calculate file hashes for %s"), self)
+            raise
+
+        for rel_path, f_hashes, hashes in hash_results:
+            for alg, computed_hash in f_hashes.items():
+                stored_hash = hashes[alg]
+                if stored_hash.lower() != computed_hash:
+                    e = ChecksumMismatch(
+                        rel_path, alg, stored_hash.lower(), computed_hash
+                    )
+                    LOGGER.warning(force_unicode(e))
+                    errors.append(e)
+
+        if errors:
+            raise BagValidationError(_("Bag validation failed"), errors)
diff --git a/test.py b/test.py
index eab3d95..c45a626 100644
--- a/test.py
+++ b/test.py
@@ -15,7 +15,7 @@ import unittest
 from os.path import join as j
 
-import mock
+from unittest import mock
 
 import bagit
 
 
diff --git a/test_bagitfs.py b/test_bagitfs.py
new file mode 100644
index 0000000..e342c34
--- /dev/null
+++ b/test_bagitfs.py
@@ -0,0 +1,57 @@
+# encoding: utf-8
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import codecs
+import datetime
+import hashlib
+import logging
+import os
+import shutil
+import stat
+import sys
+import tempfile
+import unicodedata
+import unittest
+from os.path import join as j
+
+from unittest import mock
+
+import bagitfs
+import bagit
+from test import SelfCleaningTestCase
+
+from fs.zipfs import ZipFS
+
+logging.basicConfig(filename="test.log", level=logging.DEBUG)
+stderr = logging.StreamHandler()
+stderr.setLevel(logging.WARNING)
+logging.getLogger().addHandler(stderr)
+
+# But we do want any exceptions raised in the logging path to be raised:
+logging.raiseExceptions = True
+
+@mock.patch(
+    "bagit.VERSION", new="1.5.4"
+)  # This avoids needing to change expected hashes on each release
+class TestZipValidation(SelfCleaningTestCase):
+    def test_wrong_bagit_zip_open(self):
+        with ZipFS("./test-data.zip") as zip_fs:
+            with self.assertRaises(bagit.BagError) as raised:
+                bagitfs.BagitFs(zip_fs)
+            self.assertEqual(
+                "Expected bagit.txt does not exist: /bagit.txt", str(raised.exception))
+
+    def test_correct_bagit_zip(self):
+        bag = bagit.make_bag(self.tmpdir, checksum=["sha1", "sha256"])
+        # check that relevant manifests are created
+        self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha1.txt")))
+        self.assertTrue(os.path.isfile(j(self.tmpdir, "manifest-sha256.txt")))
+
+        shutil.make_archive(os.path.join(self.tmpdir, "bag-correct_zip"), 'zip', self.tmpdir)
+        with ZipFS(os.path.join(self.tmpdir, "bag-correct_zip.zip")) as zip_fs:
+            bag = bagitfs.BagitFs(zip_fs)
+            bag.validate()
+
+if __name__ == "__main__":
+    unittest.main()
From 1ec9624b74a635f43ff5c713066055c02aa3431e Mon Sep 17 00:00:00 2001
From: "Spigel, Radim"
Date: Wed, 26 Feb 2020 17:51:50 +0100
Subject: [PATCH 2/2] chore: support 3.7

---
 pyproject.toml | 17 +++++++++++++++++
 setup.py       |  3 ++-
 2 files changed, 19 insertions(+), 1 deletion(-)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..b8d64a1
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.poetry]
+name = "bagitfs"
+version = "0.1.0"
+description = "Bagit package"
+authors = ["Radim Spigel "]
+
+[tool.poetry.dependencies]
+python = "^3.6"
+bagit = "1.7.0"
+fs = "^2.0"
+
+[tool.poetry.dev-dependencies]
+pytest = "^3.0"
+
+[build-system]
+requires = ["poetry>=0.12", "bagit"]
+build-backend = "poetry.masonry.api"
diff --git a/setup.py b/setup.py
index 0374e01..0759b12 100644
--- a/setup.py
+++ b/setup.py
@@ -55,7 +55,7 @@ def get_message_catalogs():
     author="Ed Summers",
     author_email="ehs@pobox.com",
-    py_modules=["bagit"],
+    py_modules=["bagit", "bagitfs"],
     scripts=["bagit.py"],
     data_files=get_message_catalogs(),
     description=description,
     long_description=long_description,
@@ -76,5 +76,6 @@ def get_message_catalogs():
         "Programming Language :: Python :: 3.4",
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
     ],
 )