Skip to content

Commit

Permalink
opendatacube#43 Use multiprocessing for YAML download
Browse files (browse the repository at this point in the history)
Signed-off-by: whatnick <[email protected]>
  • Loading branch information
whatnick committed Apr 20, 2020
1 parent bfe706f commit bb76cfb
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 18 deletions.
34 changes: 18 additions & 16 deletions libs/thredds/odc/thredds/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from thredds_crawler.crawl import Crawl
import requests
from urllib.parse import urlparse
from multiprocessing.dummy import Pool as ThreadPool


def thredds_find_glob(
Expand Down Expand Up @@ -36,26 +37,23 @@ def thredds_find_glob(
return urls


def download_yamls(yaml_urls: list) -> list:
def download_yamls(yaml_urls: list, workers : int = 8) -> list:
"""Download all YAML's in a list of URL's and generate
Arguments:
yaml_urls {list} -- List of URL's to download YAML's from
workers {int} -- Number of workers to use for Thredds Downloading
Returns:
list -- tuples of contents and filenames
"""
# TODO: Make this parallel with Asyncio or Multi-processing
yaml_collection = []
for url in yaml_urls:
try:
yaml_collection.append(_download(url))
except Exception as e:
# Stash errors
yaml_collection.append((None, None, e))

return yaml_collection
# use a threadpool to download from thredds
pool = ThreadPool(workers)
yamls = pool.map(_download, yaml_urls)
pool.close()
pool.join()

return yamls

def _download(url: str) -> tuple:
"""Internal method to download YAML's from thredds via requests
Expand All @@ -71,8 +69,12 @@ def _download(url: str) -> tuple:
"""
parsed_uri = urlparse(url)
target_filename = url[len(parsed_uri.scheme + "://") :]
resp = requests.get(url)
if resp.status_code == 200:
return (resp.content, target_filename, None)
else:
raise Exception("Yaml not found")
try:
resp = requests.get(url)
if resp.status_code == 200:
return (resp.content, target_filename, None)
else:
return(None, None, "Yaml not found")
except Exception as e:
return(None, None, "Thredds Failed")

5 changes: 3 additions & 2 deletions libs/thredds/tests/test_thredds.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@ def test_download_yaml():
"""
test_urls = [
"http://dapds00.nci.org.au/thredds/fileServer/if87/2018-11-29/S2A_OPER_MSI_ARD_TL_EPAE_20181129T012952_A017945_T56LLM_N02.07/ARD-METADATA.yaml",
"http://dapds00.nci.org.au/thredds/fileServer/if87/2028-11-29/S2A_OPER_MSI_ARD_TL_EPAE_20281129T012952_A017945_T56LLM_N02.07/ARD-METADATA.yaml"
"http://dapds00.nci.org.au/thredds/fileServer/if87/2028-11-29/S2A_OPER_MSI_ARD_TL_EPAE_20281129T012952_A017945_T56LLM_N02.07/ARD-METADATA.yaml",
"http://downtime00.nci.org.au/thredds/fileServer/if87/2018-11-29/S2A_OPER_MSI_ARD_TL_EPAE_20181129T012952_A017945_T56LLM_N02.07/ARD-METADATA.yaml"
]
results = download_yamls(test_urls)
assert results
assert len(results) == 2
assert len(results) == 3
print(results)
assert results[0][0] is not None
assert results[1][0] is None

0 comments on commit bb76cfb

Please sign in to comment.