From dae7b40546f117018892a19a6b0833dc8ff7b041 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Nov 2018 15:21:41 +0100 Subject: [PATCH 1/7] Minimal implementation of fetching entries of fetch.txt, #118 --- .gitignore | 7 ++++++- bagit.py | 42 ++++++++++++++++++++++++++++++++++++++++-- test.py | 27 +++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 1f4b930..a3050ee 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,10 @@ bench-data build dist MANIFEST -bagit.egg-info .idea +test.log +*.egg-info +.eggs +*.egg +.tox +locale/**/*.mo diff --git a/bagit.py b/bagit.py index a821973..f10e124 100755 --- a/bagit.py +++ b/bagit.py @@ -14,6 +14,7 @@ import signal import sys import tempfile +import urllib import unicodedata import warnings from collections import defaultdict @@ -23,9 +24,12 @@ from pkg_resources import DistributionNotFound, get_distribution -try: +# pylint: disable=no-name-in-module, import-error, wrong-import-position +if sys.version_info >= (3,): from urllib.parse import urlparse -except ImportError: + from urllib.request import urlopen, FancyURLopener +else: + from urllib import urlopen, FancyURLopener from urlparse import urlparse @@ -582,6 +586,37 @@ def files_to_be_fetched(self): for url, file_size, filename in self.fetch_entries(): yield filename + def fetch_files_to_be_fetched(self): + """ + Fetches files from the fetch.txt + """ + urllib._urlopener = BagFetcherURLOpener # pylint: disable=protected-access + for url, expected_size, filename in self.fetch_entries(): + expected_size = int(expected_size) # FIXME should be int in the first place + if filename in self.payload_files(): + LOGGER.info(_("File already fetched: %s"), filename) + continue + resp = urlopen(url) + headers = resp.info() + if "content-length" not in headers: + LOGGER.warning(_("Server sent no content-length for <%s>"), url) + else: + content_length = int(headers['content-length']) + if content_length 
!= expected_size: + raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length)) + with open(join(self.path, filename), 'wb') as out: + read = 0 + while True: + block = resp.read(1024 * 8) + if not block: + break + read += len(block) + out.write(block) + if read != expected_size: + raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read)) + LOGGER.info(_("Fetched %s from %s"), filename, url) + + def has_oxum(self): return "Payload-Oxum" in self.info @@ -767,6 +802,7 @@ def validate_fetch(self): # well formed: parsed_url = urlparse(url) + # ensure url is a remote URL, not file:// if not all((parsed_url.scheme, parsed_url.netloc)): raise BagError(_("Malformed URL in fetch.txt: %s") % url) @@ -937,6 +973,8 @@ def _path_is_dangerous(self, path): common = os.path.commonprefix((bag_path, real_path)) return not (common == bag_path) +class BagFetcherURLOpener(FancyURLopener): + version = "bagit.py/%s (Python/%s)" % (VERSION, sys.version) class BagError(Exception): pass diff --git a/test.py b/test.py index eab3d95..8dd21bc 100644 --- a/test.py +++ b/test.py @@ -1099,6 +1099,33 @@ def test_fetch_malformed_url(self): self.assertEqual(expected_msg, str(cm.exception)) + # FIXME: Won't work since file:// URLs are rejected + # def test_fetching_payload_file(self): + # with open(j(self.tmpdir, "mock_data"), "w") as mock_data: + # print("Lorem ipsum dolor sit", file=mock_data) + # with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: + # print("file://%s 21 data/mock_data" % j(self.tmpdir, "mock_data"), file=fetch_txt) + # self.bag.save(manifests=True) + # self.bag.validate_fetch() + + def test_fetching_payload_file(self): + test_payload = 'loc/2478433644_2839c5e8b8_o_d.jpg' + with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: + print("https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % ( + test_payload, 139367, 
test_payload), file=fetch_txt) + self.bag.save(manifests=True) + # should be valid + self.bag.validate() + # now delete the payload, should be invalid + os.unlink(j(self.tmpdir, "data", test_payload)) + self.assertEqual(len(self.bag.compare_fetch_with_fs()), 1, '1 file to fetch') + with self.assertRaises(bagit.BagError): + self.bag.validate() + # re-fetch it + self.bag.fetch_files_to_be_fetched() + # should be valid again + self.bag.validate() + self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete') class TestUtils(unittest.TestCase): def setUp(self): From aae8cc31c4138299ee9c918f4d3dd0fa0cd386a7 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Nov 2018 15:52:01 +0100 Subject: [PATCH 2/7] update locale --- locale/bagit-python.pot | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/locale/bagit-python.pot b/locale/bagit-python.pot index 3543c31..04b1ddd 100644 --- a/locale/bagit-python.pot +++ b/locale/bagit-python.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2018-06-26 10:28-0400\n" +"POT-Creation-Date: 2018-11-27 15:21+0100\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -168,6 +168,26 @@ msgstr "" msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe" msgstr "" +#, python-format +msgid "File already fetched: %s" +msgstr "" + +#, python-format +msgid "Server sent no content-length for <%s>" +msgstr "" + +#, python-format +msgid "Inconsistent size of %s: Expected %s but Content-Length is %s" +msgstr "" + +#, python-format +msgid "Inconsistent size of %s: Expected %s but received %s" +msgstr "" + +#, python-format +msgid "Fetched %s from %s" +msgstr "" + #, python-format msgid "" "%s is encoded using UTF-8 but contains an unnecessary byte-order mark, which " From 6f937868ddfc1951b4919fbc95ebcb4cd1d37cce Mon Sep 17 00:00:00 2001 From: Konstantin 
Baierer Date: Tue, 27 Nov 2018 17:38:41 +0100 Subject: [PATCH 3/7] properly set user-agent for py2/py3 --- bagit.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bagit.py b/bagit.py index f10e124..869edb3 100755 --- a/bagit.py +++ b/bagit.py @@ -28,6 +28,7 @@ if sys.version_info >= (3,): from urllib.parse import urlparse from urllib.request import urlopen, FancyURLopener + import urllib.request else: from urllib import urlopen, FancyURLopener from urlparse import urlparse @@ -590,7 +591,10 @@ def fetch_files_to_be_fetched(self): """ Fetches files from the fetch.txt """ - urllib._urlopener = BagFetcherURLOpener # pylint: disable=protected-access + if sys.version_info >= (3,): + urllib.request._urlopener = BagFetcherURLOpener() # pylint: disable=protected-access + else: + urllib._urlopener = BagFetcherURLOpener() # pylint: disable=protected-access for url, expected_size, filename in self.fetch_entries(): expected_size = int(expected_size) # FIXME should be int in the first place if filename in self.payload_files(): @@ -974,7 +978,7 @@ def _path_is_dangerous(self, path): return not (common == bag_path) class BagFetcherURLOpener(FancyURLopener): - version = "bagit.py/%s (Python/%s)" % (VERSION, sys.version) + version = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info) class BagError(Exception): pass From ac5ea43b69788979a03c058bf18dd62b0ef8fac1 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Nov 2018 18:50:30 +0100 Subject: [PATCH 4/7] revert gitignore changes so git clean will do its thing --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index a3050ee..8fec75f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,9 +4,3 @@ build dist MANIFEST .idea -test.log -*.egg-info -.eggs -*.egg -.tox -locale/**/*.mo From 91f5f4d6e285404e45b6424e1e00af4f3f973a0b Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Tue, 27 Nov 2018 19:25:27 +0100 Subject: [PATCH 5/7] replace urlopen with 
non-global urllib2/urllib.request solution --- bagit.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/bagit.py b/bagit.py index 869edb3..65c537a 100755 --- a/bagit.py +++ b/bagit.py @@ -14,7 +14,6 @@ import signal import sys import tempfile -import urllib import unicodedata import warnings from collections import defaultdict @@ -27,10 +26,9 @@ # pylint: disable=no-name-in-module, import-error, wrong-import-position if sys.version_info >= (3,): from urllib.parse import urlparse - from urllib.request import urlopen, FancyURLopener - import urllib.request + from urllib.request import ProxyHandler, Request, build_opener else: - from urllib import urlopen, FancyURLopener + from urllib2 import ProxyHandler, Request, build_opener from urlparse import urlparse @@ -591,16 +589,17 @@ def fetch_files_to_be_fetched(self): """ Fetches files from the fetch.txt """ - if sys.version_info >= (3,): - urllib.request._urlopener = BagFetcherURLOpener() # pylint: disable=protected-access - else: - urllib._urlopener = BagFetcherURLOpener() # pylint: disable=protected-access + proxy_handler = ProxyHandler() # will default to adhere to *_proxy env vars + opener = build_opener(proxy_handler) + user_agent = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info) for url, expected_size, filename in self.fetch_entries(): expected_size = int(expected_size) # FIXME should be int in the first place if filename in self.payload_files(): LOGGER.info(_("File already fetched: %s"), filename) continue - resp = urlopen(url) + req = Request(url) + req.add_header('User-Agent', user_agent) + resp = opener.open(req) headers = resp.info() if "content-length" not in headers: LOGGER.warning(_("Server sent no content-length for <%s>"), url) @@ -977,8 +976,6 @@ def _path_is_dangerous(self, path): common = os.path.commonprefix((bag_path, real_path)) return not (common == bag_path) -class BagFetcherURLOpener(FancyURLopener): - version = "bagit.py/%s (Python/%s)" % (VERSION, 
sys.version_info) class BagError(Exception): pass From cacf9d9474c9b50d6efba69e8ff4ff7b0745de5d Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 3 Dec 2018 12:22:19 +0100 Subject: [PATCH 6/7] Implement a whitelist mechanism to validate fetch entries By default only allow HTTP(S) and (S)FTP URLs --- bagit.py | 47 +++++++++++++++++++-------------- locale/bagit-python.pot | 10 +++---- test.py | 58 +++++++++++++++++++++++------------------ 3 files changed, 65 insertions(+), 50 deletions(-) diff --git a/bagit.py b/bagit.py index 65c537a..1194c61 100755 --- a/bagit.py +++ b/bagit.py @@ -16,6 +16,7 @@ import tempfile import unicodedata import warnings +from fnmatch import fnmatch from collections import defaultdict from datetime import date from functools import partial @@ -127,6 +128,7 @@ def find_locale_dir(): CHECKSUM_ALGOS = hashlib.algorithms_guaranteed DEFAULT_CHECKSUMS = ["sha256", "sha512"] +DEFAULT_FETCH_URL_WHITELIST = ["https://*", "http://*", "ftp://*", "sftp://*"] #: Block size used when reading files for hashing: HASH_BLOCK_SIZE = 512 * 1024 @@ -140,7 +142,7 @@ def find_locale_dir(): def make_bag( - bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8" + bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8", fetch_url_whitelist=None ): """ Convert a given directory into a bag.
You can pass in arbitrary @@ -278,7 +280,7 @@ class Bag(object): valid_files = ["bagit.txt", "fetch.txt"] valid_directories = ["data"] - def __init__(self, path=None): + def __init__(self, path=None, fetch_url_whitelist=None): super(Bag, self).__init__() self.tags = {} self.info = {} @@ -299,6 +301,7 @@ def __init__(self, path=None): self.normalized_manifest_names = {} self.algorithms = [] + self.fetch_url_whitelist = DEFAULT_FETCH_URL_WHITELIST if fetch_url_whitelist is None else fetch_url_whitelist self.tag_file_name = None self.path = abspath(path) if path: @@ -582,7 +585,7 @@ def files_to_be_fetched(self): local filename """ - for url, file_size, filename in self.fetch_entries(): + for _, _, filename in self.fetch_entries(): yield filename def fetch_files_to_be_fetched(self): @@ -593,7 +596,9 @@ def fetch_files_to_be_fetched(self): opener = build_opener(proxy_handler) user_agent = "bagit.py/%s (Python/%s)" % (VERSION, sys.version_info) for url, expected_size, filename in self.fetch_entries(): - expected_size = int(expected_size) # FIXME should be int in the first place + if not fnmatch_any(url, self.fetch_url_whitelist): + raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist)) + expected_size = -1 if expected_size == '-' else int(expected_size) if filename in self.payload_files(): LOGGER.info(_("File already fetched: %s"), filename) continue @@ -601,12 +606,13 @@ def fetch_files_to_be_fetched(self): req.add_header('User-Agent', user_agent) resp = opener.open(req) headers = resp.info() - if "content-length" not in headers: - LOGGER.warning(_("Server sent no content-length for <%s>"), url) - else: - content_length = int(headers['content-length']) - if content_length != expected_size: - raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length)) + if expected_size >= 0: + if "content-length" not in headers: + 
LOGGER.warning(_("Server sent no content-length for <%s>"), url) + else: + content_length = int(headers['content-length']) + if content_length != expected_size: + raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length)) with open(join(self.path, filename), 'wb') as out: read = 0 while True: @@ -615,7 +621,7 @@ def fetch_files_to_be_fetched(self): break read += len(block) out.write(block) - if read != expected_size: + if expected_size >= 0 and read != expected_size: raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, read)) LOGGER.info(_("Fetched %s from %s"), filename, url) @@ -799,15 +805,10 @@ def validate_fetch(self): Raises `BagError` for errors and otherwise returns no value """ - for url, file_size, filename in self.fetch_entries(): - # fetch_entries will raise a BagError for unsafe filenames - # so at this point we will check only that the URL is minimally - # well formed: - parsed_url = urlparse(url) - - # ensure url is a remote URL, not file:// - if not all((parsed_url.scheme, parsed_url.netloc)): - raise BagError(_("Malformed URL in fetch.txt: %s") % url) + for url, expected_size, filename in self.fetch_entries(): + # ensure url matches one of the allowed patterns + if not fnmatch_any(url, self.fetch_url_whitelist): + raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist)) def _validate_contents(self, processes=1, fast=False, completeness_only=False): if fast and not self.has_oxum(): @@ -1450,6 +1451,12 @@ def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS): return results +# Return true if any of the pattern fnmatches a string +def fnmatch_any(s, pats): + for pat in pats: + if fnmatch(s, pat): + return True + return False def _encode_filename(s): s = s.replace("\r", "%0D") diff --git a/locale/bagit-python.pot b/locale/bagit-python.pot index 
04b1ddd..06294d8 100644 --- a/locale/bagit-python.pot +++ b/locale/bagit-python.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2018-11-27 15:21+0100\n" +"POT-Creation-Date: 2018-12-03 12:06+0100\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -168,6 +168,10 @@ msgstr "" msgid "Path \"%(payload_file)s\" in \"%(source_file)s\" is unsafe" msgstr "" +#, python-format +msgid "Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s" +msgstr "" + #, python-format msgid "File already fetched: %s" msgstr "" @@ -225,10 +229,6 @@ msgstr "" msgid "Expected %s to contain \"bagit.txt\"" msgstr "" -#, python-format -msgid "Malformed URL in fetch.txt: %s" -msgstr "" - msgid "Fast validation requires bag-info.txt to include Payload-Oxum" msgstr "" diff --git a/test.py b/test.py index 8dd21bc..268394d 100644 --- a/test.py +++ b/test.py @@ -1081,33 +1081,41 @@ def test_fetch_unsafe_payloads(self): self.assertEqual(expected_msg, str(cm.exception)) - def test_fetch_malformed_url(self): - with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: - print( - "//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg - data/nasa/PIA21390.jpg", - file=fetch_txt, - ) - - self.bag.save(manifests=True) - - expected_msg = ( - "Malformed URL in fetch.txt: //photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg" - ) - - with self.assertRaises(bagit.BagError) as cm: + def test_invalid_urls(self): + invalid_urls = [ + '//photojournal.jpl.nasa.gov/jpeg/PIA21390.jpg', + 'file://%s' % j(self.tmpdir, "mock_data"), + '../../../../../etc/passwd', + ] + for url in invalid_urls: + with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: + print("%s - data/mock_data" % url, file=fetch_txt) + with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % url): + self.bag.validate_fetch() + + def test_invalid_urls_whitelist(self): + 
self.bag.fetch_url_whitelist = [ + 'https://my.inst.edu/data/*.png' + ] + valid_urls = [ + 'https://my.inst.edu/data/foo.png' + ] + invalid_urls = [ + 'https://my.inst.edu/data/foo', + 'https://my.inst.edu/robots.txt', + 'http://my.inst.edu/data/foo', + 'https://example.org', + ] + for url in invalid_urls: + with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: + print("%s - data/mock_data" % url, file=fetch_txt) + with self.assertRaisesRegexp(bagit.BagError, "^Malformed URL in fetch.txt: %s" % url): + self.bag.validate_fetch() + for url in valid_urls: + with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: + print("%s - data/mock_data" % url, file=fetch_txt) self.bag.validate_fetch() - self.assertEqual(expected_msg, str(cm.exception)) - - # FIXME: Won't work since file:// URLs are rejected - # def test_fetching_payload_file(self): - # with open(j(self.tmpdir, "mock_data"), "w") as mock_data: - # print("Lorem ipsum dolor sit", file=mock_data) - # with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: - # print("file://%s 21 data/mock_data" % j(self.tmpdir, "mock_data"), file=fetch_txt) - # self.bag.save(manifests=True) - # self.bag.validate_fetch() - def test_fetching_payload_file(self): test_payload = 'loc/2478433644_2839c5e8b8_o_d.jpg' with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: print("https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % ( test_payload, 139367, test_payload), file=fetch_txt) self.bag.save(manifests=True) From 6fda25c0103ec38c040052b57f5aec83f3dc1840 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 7 Dec 2018 17:35:58 +0100 Subject: [PATCH 7/7] Rename fetch_files_to_be_fetched -> fetch, force option --- bagit.py | 7 +++++-- test.py | 25 ++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/bagit.py b/bagit.py index 1194c61..4aa6186 100755 --- a/bagit.py +++ b/bagit.py @@ -588,9 +588,12 @@ def files_to_be_fetched(self): for _, _, filename in self.fetch_entries(): yield filename - def fetch_files_to_be_fetched(self): + def fetch(self, force=False): """ Fetches files from the fetch.txt + + Arguments: + force (boolean):
Fetch files even when they are present in the data directory """ proxy_handler = ProxyHandler() # will default to adhere to *_proxy env vars opener = build_opener(proxy_handler) @@ -599,7 +602,7 @@ def fetch_files_to_be_fetched(self): if not fnmatch_any(url, self.fetch_url_whitelist): raise BagError(_("Malformed URL in fetch.txt: %s, matches none of the whitelisted URL patterns %s") % (url, self.fetch_url_whitelist)) expected_size = -1 if expected_size == '-' else int(expected_size) - if filename in self.payload_files(): + if filename in self.payload_files() and not force: LOGGER.info(_("File already fetched: %s"), filename) continue req = Request(url) diff --git a/test.py b/test.py index 268394d..8fcc7c8 100644 --- a/test.py +++ b/test.py @@ -1130,7 +1130,30 @@ def test_fetching_payload_file(self): with self.assertRaises(bagit.BagError): self.bag.validate() # re-fetch it - self.bag.fetch_files_to_be_fetched() + self.bag.fetch() + # should be valid again + self.bag.validate() + self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete') + + def test_force_fetching(self): + test_payload = 'loc/2478433644_2839c5e8b8_o_d.jpg' + with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt: + print("https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % ( + test_payload, 139367, test_payload), file=fetch_txt) + self.bag.save(manifests=True) + # now replace one payload file with an empty string + with open(j(self.tmpdir, "data", test_payload), 'w') as payload: + payload.write('') + # should be invalid now + with self.assertRaisesRegexp(bagit.BagError, "^Payload-Oxum validation failed."): + self.bag.validate() + # non-forcefully downloading should not help + self.bag.fetch() + # should **still* be invalid now + with self.assertRaisesRegexp(bagit.BagError, "^Payload-Oxum validation failed."): + self.bag.validate() + # fetch with force + self.bag.fetch(force=True) # should be valid again self.bag.validate() 
self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete')