Skip to content

Commit 6f0ae02

Browse files
committed
Minimal implementation of fetching entries of fetch.txt, LibraryOfCongress#118
1 parent 8a8263e commit 6f0ae02

File tree

3 files changed

+73
-3
lines changed

3 files changed

+73
-3
lines changed

.gitignore

+6-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,10 @@ bench-data
33
build
44
dist
55
MANIFEST
6-
bagit.egg-info
76
.idea
7+
test.log
8+
*.egg-info
9+
.eggs
10+
*.egg
11+
.tox
12+
locale/**/*.mo

bagit.py

+40-2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import signal
1515
import sys
1616
import tempfile
17+
import urllib
1718
import unicodedata
1819
import warnings
1920
from collections import defaultdict
@@ -23,9 +24,12 @@
2324

2425
from pkg_resources import DistributionNotFound, get_distribution
2526

26-
try:
27+
# pylint: disable=no-name-in-module, import-error, wrong-import-position
28+
if sys.version_info >= (3,):
2729
from urllib.parse import urlparse
28-
except ImportError:
30+
from urllib.request import urlopen, FancyURLopener
31+
else:
32+
from urllib import urlopen, FancyURLopener
2933
from urlparse import urlparse
3034

3135

@@ -582,6 +586,37 @@ def files_to_be_fetched(self):
582586
for url, file_size, filename in self.fetch_entries():
583587
yield filename
584588

589+
def fetch_files_to_be_fetched(self):
    """
    Fetch the files listed in fetch.txt that are not yet in the payload.

    Iterates over the (url, size, filename) triples produced by
    ``self.fetch_entries()``, skips every filename already reported by
    ``self.payload_files()`` and streams each remaining URL to
    ``join(self.path, filename)``.

    A size of "-" in fetch.txt (length left unspecified) disables the size
    checks for that entry. A missing Content-Length header is only logged.

    Raises BagError when the Content-Length header, or the number of bytes
    actually received, differs from the size declared in fetch.txt. An
    incomplete or wrongly sized download is removed again so that it cannot
    be mistaken for a fetched file later.
    """
    # Python 2's urllib.urlopen() calls ``_urlopener.open(url)``, so the hook
    # has to be an opener *instance*; assigning the class makes that call fail.
    # NOTE(review): Python 3's urllib.request.urlopen() does not appear to
    # consult this attribute at all, so the custom User-Agent is presumably
    # only sent on Python 2 -- confirm.
    urllib._urlopener = BagFetcherURLOpener()  # pylint: disable=protected-access
    for url, expected_size, filename in self.fetch_entries():
        # FIXME should be int in the first place
        # "-" means the length is unspecified, so there is nothing to compare.
        expected_size = None if expected_size == "-" else int(expected_size)
        if filename in self.payload_files():
            LOGGER.info(_("File already fetched: %s"), filename)
            continue

        target = join(self.path, filename)
        resp = urlopen(url)
        try:
            headers = resp.info()
            if "content-length" not in headers:
                LOGGER.warning(_("Server sent no content-length for <%s>"), url)
            elif expected_size is not None:
                content_length = int(headers['content-length'])
                if content_length != expected_size:
                    raise BagError(_("Inconsistent size of %s: Expected %s but Content-Length is %s") % (filename, expected_size, content_length))

            # The payload sub-directory may not exist yet in a holey bag.
            target_dir = os.path.dirname(target)
            if target_dir and not os.path.isdir(target_dir):
                os.makedirs(target_dir)

            received = 0
            complete = False
            try:
                with open(target, 'wb') as out:
                    while True:
                        block = resp.read(1024 * 8)
                        if not block:
                            break
                        received += len(block)
                        out.write(block)
                if expected_size is not None and received != expected_size:
                    raise BagError(_("Inconsistent size of %s: Expected %s but received %s") % (filename, expected_size, received))
                complete = True
            finally:
                # Never leave a truncated or wrongly sized file behind.
                if not complete and os.path.exists(target):
                    os.remove(target)
        finally:
            # Release the connection whether or not the download succeeded.
            resp.close()
        LOGGER.info(_("Fetched %s from %s"), filename, url)
618+
619+
585620
def has_oxum(self):
    """Return True when ``self.info`` contains a "Payload-Oxum" entry."""
    oxum_key = "Payload-Oxum"
    return oxum_key in self.info
587622

@@ -767,6 +802,7 @@ def validate_fetch(self):
767802
# well formed:
768803
parsed_url = urlparse(url)
769804

805+
# ensure url is a remote URL, not file://
770806
if not all((parsed_url.scheme, parsed_url.netloc)):
771807
raise BagError(_("Malformed URL in fetch.txt: %s") % url)
772808

@@ -937,6 +973,8 @@ def _path_is_dangerous(self, path):
937973
common = os.path.commonprefix((bag_path, real_path))
938974
return not (common == bag_path)
939975

976+
class BagFetcherURLOpener(FancyURLopener):
    """URL opener that identifies bagit.py through its ``version`` string."""

    # ``version`` is what the urllib opener classes send as the User-Agent.
    # Use the numeric interpreter version: the raw ``sys.version`` string also
    # carries build/compiler details and can contain a newline, which does not
    # belong in an HTTP header value.
    version = "bagit.py/%s (Python/%s)" % (VERSION, "%d.%d.%d" % sys.version_info[:3])
940978

941979
class BagError(Exception):
    """Error raised for bag problems such as a malformed fetch.txt URL."""

test.py

+27
Original file line numberDiff line numberDiff line change
@@ -1099,6 +1099,33 @@ def test_fetch_malformed_url(self):
10991099

11001100
self.assertEqual(expected_msg, str(cm.exception))
11011101

1102+
# FIXME: Won't work since file:// URLs are rejected
1103+
# def test_fetching_payload_file(self):
1104+
# with open(j(self.tmpdir, "mock_data"), "w") as mock_data:
1105+
# print("Lorem ipsum dolor sit", file=mock_data)
1106+
# with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
1107+
# print("file://%s 21 data/mock_data" % j(self.tmpdir, "mock_data"), file=fetch_txt)
1108+
# self.bag.save(manifests=True)
1109+
# self.bag.validate_fetch()
1110+
1111+
def test_fetching_payload_file(self):
1112+
test_payload = 'loc/2478433644_2839c5e8b8_o_d.jpg'
1113+
with open(j(self.tmpdir, "fetch.txt"), "w") as fetch_txt:
1114+
print("https://github.com/LibraryOfCongress/bagit-python/raw/master/test-data/%s %s data/%s" % (
1115+
test_payload, 139367, test_payload), file=fetch_txt)
1116+
self.bag.save(manifests=True)
1117+
# should be valid
1118+
self.bag.validate()
1119+
# now delete the payload, should be invalid
1120+
os.unlink(j(self.tmpdir, "data", test_payload))
1121+
self.assertEqual(len(self.bag.compare_fetch_with_fs()), 1, '1 file to fetch')
1122+
with self.assertRaises(bagit.BagError):
1123+
self.bag.validate()
1124+
# re-fetch it
1125+
self.bag.fetch_files_to_be_fetched()
1126+
# should be valid again
1127+
self.bag.validate()
1128+
self.assertEqual(len(self.bag.compare_fetch_with_fs()), 0, 'complete')
11021129

11031130
class TestUtils(unittest.TestCase):
11041131
def setUp(self):

0 commit comments

Comments
 (0)