From 5d0f767247dab1224918a0f6c3b60a8d96dae227 Mon Sep 17 00:00:00 2001 From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:16:10 +0000 Subject: [PATCH 1/9] Potential revision to optimise convert_byte_size I've been optimising a lot of Python code recently and offer this as a potential alternative to convert_byte_size(). Removing the nested if.. elif... statements and making it easier to add additional si prefixes in the future if desired. --- src/neonutilities/aop_download.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/neonutilities/aop_download.py b/src/neonutilities/aop_download.py index 38d5832..1e8f062 100644 --- a/src/neonutilities/aop_download.py +++ b/src/neonutilities/aop_download.py @@ -98,23 +98,17 @@ def convert_byte_size(size_bytes): >>> convert_byte_size(4000000000000) '4.0 TB' """ - if 10**3 < size_bytes < 10**6: - size_kb = round(size_bytes/(10**3), 2) - size_read = f'{size_kb} KB' - elif 10**6 < size_bytes < 10**9: - size_mb = round(size_bytes/(10**6), 1) - size_read = f'{size_mb} MB' - # print('Download size:', size_read) - elif 10**9 < size_bytes < 10**12: - size_gb = round(size_bytes/(10**9), 1) - size_read = f'{size_gb} GB' - # print('Download size:', size_read) - else: - size_tb = round(size_bytes/(10**12), 1) - size_read = f'{size_tb} TB' - # print('Download size:', size_read) - return size_read - + si_prefix = [[12, 'T'], + [ 9, 'G'], + [ 6, 'M'], + [ 3, 'K'], + [ 0, '' ]] + + for si_row in si_prefix: + if (size_bytes >= 10 ** si_row[0]): + break + + return f'{(size_bytes / 10 ** si_row[0]):.0f} {si_row[1]}B' # %% From 0fc29435e89880ad390bad5533d633a7b2a05d21 Mon Sep 17 00:00:00 2001 From: Bridget Hass Date: Wed, 9 Apr 2025 13:11:24 -0600 Subject: [PATCH 2/9] update convert_byte_size based on PyOpenSci PR 14, update to use 1024 instead of 1000, consolidate repeated code in aop_download.py and metadata_helpers.py --- src/neonutilities/aop_download.py | 54 ++----------------- .../helper_mods/metadata_helpers.py | 37 ++++++------- 2 files changed, 19 insertions(+), 72 deletions(-) diff --git a/src/neonutilities/aop_download.py b/src/neonutilities/aop_download.py index 1e8f062..cf70f74 100644 --- a/src/neonutilities/aop_download.py +++ b/src/neonutilities/aop_download.py @@ -31,6 +31,7 @@ from . import __resources__ from .helper_mods.api_helpers import get_api from .helper_mods.api_helpers import download_file +from .helper_mods.metadata_helpers import convert_byte_size from .get_issue_log import get_issue_log from .citation import get_citation @@ -62,54 +63,6 @@ def check_token(response): logging.info( 'API token was not recognized. Public rate limit applied.\n') - -def convert_byte_size(size_bytes): - """ - This function converts the file size in bytes to a more readable format. - It converts bytes to Kilobytes (KB), Megabytes (MB), Gigabytes (GB), or Terabytes (TB) - depending on the size of the input. - - Parameters - -------- - size_bytes: int or float - The file size in bytes. It should be a non-negative number. - - Returns - -------- - str - A string that represents the file size in a more readable format. - The format includes the size number followed by the size unit (KB, MB, GB, or TB). 
- - Raises - -------- - None - - Examples - -------- - >>> convert_byte_size(5000) - '5.0 KB' - - >>> convert_byte_size(2000000) - '2.0 MB' - - >>> convert_byte_size(3000000000) - '3.0 GB' - - >>> convert_byte_size(4000000000000) - '4.0 TB' -""" - si_prefix = [[12, 'T'], - [ 9, 'G'], - [ 6, 'M'], - [ 3, 'K'], - [ 0, '' ]] - - for si_row in si_prefix: - if (size_bytes >= 10 ** si_row[0]): - break - - return f'{(size_bytes / 10 ** si_row[0]):.0f} {si_row[1]}B' - # %% @@ -785,12 +738,11 @@ def by_file_aop(dpid, # if 'PROVISIONAL' in releases and not include_provisional: if include_provisional: - # print provisional included message - print("Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.") + # log provisional included message logging.info( "Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.") else: - # print provisional not included message and filter to the released data + # log provisional not included message and filter to the released data # logging.info( # "Provisional data are not included. To download provisional data, use input parameter include_provisional=True.") file_url_df = file_url_df[file_url_df['release'] != 'PROVISIONAL'] diff --git a/src/neonutilities/helper_mods/metadata_helpers.py b/src/neonutilities/helper_mods/metadata_helpers.py index fe2e5cc..64a652c 100644 --- a/src/neonutilities/helper_mods/metadata_helpers.py +++ b/src/neonutilities/helper_mods/metadata_helpers.py @@ -51,7 +51,7 @@ def convert_byte_size(size_bytes): """ This function converts the file size in bytes to a more readable format. It converts bytes to Kilobytes (KB), Megabytes (MB), Gigabytes (GB), or Terabytes (TB) - depending on the size of the input. + depending on the size of the input. This uses 1024 bytes as the base for conversion. 
Parameters -------- @@ -71,30 +71,25 @@ def convert_byte_size(size_bytes): Examples -------- >>> convert_byte_size(5000) - '5.0 KB' + '4.9 KB' >>> convert_byte_size(2000000) - '2.0 MB' + '1.9 MB' >>> convert_byte_size(3000000000) - '3.0 GB' + '2.8 GB' >>> convert_byte_size(4000000000000) - '4.0 TB' + '3.6 TB' """ - if 10**3 < size_bytes < 10**6: - size_kb = round(size_bytes/(10**3), 2) - size_read = f'{size_kb} KB' - elif 10**6 < size_bytes < 10**9: - size_mb = round(size_bytes/(10**6), 1) - size_read = f'{size_mb} MB' - # print('Download size:', size_read) - elif 10**9 < size_bytes < 10**12: - size_gb = round(size_bytes/(10**9), 1) - size_read = f'{size_gb} GB' - # print('Download size:', size_read) - else: - size_tb = round(size_bytes/(10**12), 1) - size_read = f'{size_tb} TB' - # print('Download size:', size_read) - return size_read + si_prefix = [[40, 'T'], + [30, 'G'], + [20, 'M'], + [10, 'K'], + [ 0, '' ]] + + for si_row in si_prefix: + if (size_bytes >= 2 ** si_row[0]): + break + + return f'{(size_bytes / 2 ** si_row[0]):.1f} {si_row[1]}B' From 63adf2ddb5642c134cc7699cc7df16305c415a4d Mon Sep 17 00:00:00 2001 From: Bridget Hass Date: Wed, 9 Apr 2025 13:22:48 -0600 Subject: [PATCH 3/9] remove reference to private testing repository --- tests/test_aop_download.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_aop_download.py b/tests/test_aop_download.py index 43d3752..5f62c26 100644 --- a/tests/test_aop_download.py +++ b/tests/test_aop_download.py @@ -13,8 +13,7 @@ These mainly test the functions' output messages for invalid inputs, collocated sites, data not found, and provisional availability scenarios -More complete integration tests are included in the nu_python_testing repository. -https://github.com/NEONScience/nu-python-testing +More complete integration tests are included in the NEONScience/nu_python_testing repository, which is currently private. Notes: - These tests require an internet connection to run, as they are actually making API calls From 17f3215687e2dc305df0b88b6a546cea8f231b58 Mon Sep 17 00:00:00 2001 From: benjamindonnachie <83379521+benjamindonnachie@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:16:10 +0000 Subject: [PATCH 4/9] Potential revision to optimise convert_byte_size I've been optimising a lot of Python code recently and offer this as a potential alternative to convert_byte_size(). Removing the nested if.. elif... statements and making it easier to add additional si prefixes in the future if desired. 
--- src/neonutilities/aop_download.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/neonutilities/aop_download.py b/src/neonutilities/aop_download.py index c4e2595..f8a1605 100644 --- a/src/neonutilities/aop_download.py +++ b/src/neonutilities/aop_download.py @@ -98,23 +98,17 @@ def convert_byte_size(size_bytes): >>> convert_byte_size(4000000000000) '4.0 TB' """ - if 10**3 < size_bytes < 10**6: - size_kb = round(size_bytes/(10**3), 2) - size_read = f'{size_kb} KB' - elif 10**6 < size_bytes < 10**9: - size_mb = round(size_bytes/(10**6), 1) - size_read = f'{size_mb} MB' - # print('Download size:', size_read) - elif 10**9 < size_bytes < 10**12: - size_gb = round(size_bytes/(10**9), 1) - size_read = f'{size_gb} GB' - # print('Download size:', size_read) - else: - size_tb = round(size_bytes/(10**12), 1) - size_read = f'{size_tb} TB' - # print('Download size:', size_read) - return size_read - + si_prefix = [[12, 'T'], + [ 9, 'G'], + [ 6, 'M'], + [ 3, 'K'], + [ 0, '' ]] + + for si_row in si_prefix: + if (size_bytes >= 10 ** si_row[0]): + break + + return f'{(size_bytes / 10 ** si_row[0]):.0f} {si_row[1]}B' # %% From 56fab02aa20969ab690d3c802fa9ed1336e80796 Mon Sep 17 00:00:00 2001 From: Bridget Hass Date: Wed, 9 Apr 2025 13:11:24 -0600 Subject: [PATCH 5/9] update convert_byte_size based on PyOpenSci PR 14, update to use 1024 instead of 1000, consolidate repeated code in aop_download.py and metadata_helpers.py --- src/neonutilities/aop_download.py | 54 ++----------------- .../helper_mods/metadata_helpers.py | 37 ++++++------- 2 files changed, 19 insertions(+), 72 deletions(-) diff --git a/src/neonutilities/aop_download.py b/src/neonutilities/aop_download.py index f8a1605..e35194c 100644 --- a/src/neonutilities/aop_download.py +++ b/src/neonutilities/aop_download.py @@ -31,6 +31,7 @@ from . import __resources__ from .helper_mods.api_helpers import get_api from .helper_mods.api_helpers import download_file +from .helper_mods.metadata_helpers import convert_byte_size from .get_issue_log import get_issue_log from .citation import get_citation @@ -62,54 +63,6 @@ def check_token(response): logging.info( 'API token was not recognized. Public rate limit applied.\n') - -def convert_byte_size(size_bytes): - """ - This function converts the file size in bytes to a more readable format. - It converts bytes to Kilobytes (KB), Megabytes (MB), Gigabytes (GB), or Terabytes (TB) - depending on the size of the input. - - Parameters - -------- - size_bytes: int or float - The file size in bytes. It should be a non-negative number. - - Returns - -------- - str - A string that represents the file size in a more readable format. - The format includes the size number followed by the size unit (KB, MB, GB, or TB). - - Raises - -------- - None - - Examples - -------- - >>> convert_byte_size(5000) - '5.0 KB' - - >>> convert_byte_size(2000000) - '2.0 MB' - - >>> convert_byte_size(3000000000) - '3.0 GB' - - >>> convert_byte_size(4000000000000) - '4.0 TB' -""" - si_prefix = [[12, 'T'], - [ 9, 'G'], - [ 6, 'M'], - [ 3, 'K'], - [ 0, '' ]] - - for si_row in si_prefix: - if (size_bytes >= 10 ** si_row[0]): - break - - return f'{(size_bytes / 10 ** si_row[0]):.0f} {si_row[1]}B' - # %% @@ -806,12 +759,11 @@ def by_file_aop(dpid, # if 'PROVISIONAL' in releases and not include_provisional: if include_provisional: - # print provisional included message - print("Provisional data are included. 
To exclude provisional data, use input parameter include_provisional=False.") + # log provisional included message logging.info( "Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.") else: - # print provisional not included message and filter to the released data + # log provisional not included message and filter to the released data # logging.info( # "Provisional data are not included. To download provisional data, use input parameter include_provisional=True.") file_url_df = file_url_df[file_url_df['release'] != 'PROVISIONAL'] diff --git a/src/neonutilities/helper_mods/metadata_helpers.py b/src/neonutilities/helper_mods/metadata_helpers.py index fe2e5cc..64a652c 100644 --- a/src/neonutilities/helper_mods/metadata_helpers.py +++ b/src/neonutilities/helper_mods/metadata_helpers.py @@ -51,7 +51,7 @@ def convert_byte_size(size_bytes): """ This function converts the file size in bytes to a more readable format. It converts bytes to Kilobytes (KB), Megabytes (MB), Gigabytes (GB), or Terabytes (TB) - depending on the size of the input. + depending on the size of the input. This uses 1024 bytes as the base for conversion. Parameters -------- @@ -71,30 +71,25 @@ def convert_byte_size(size_bytes): Examples -------- >>> convert_byte_size(5000) - '5.0 KB' + '4.9 KB' >>> convert_byte_size(2000000) - '2.0 MB' + '1.9 MB' >>> convert_byte_size(3000000000) - '3.0 GB' + '2.8 GB' >>> convert_byte_size(4000000000000) - '4.0 TB' + '3.6 TB' """ - if 10**3 < size_bytes < 10**6: - size_kb = round(size_bytes/(10**3), 2) - size_read = f'{size_kb} KB' - elif 10**6 < size_bytes < 10**9: - size_mb = round(size_bytes/(10**6), 1) - size_read = f'{size_mb} MB' - # print('Download size:', size_read) - elif 10**9 < size_bytes < 10**12: - size_gb = round(size_bytes/(10**9), 1) - size_read = f'{size_gb} GB' - # print('Download size:', size_read) - else: - size_tb = round(size_bytes/(10**12), 1) - size_read = f'{size_tb} TB' - # print('Download size:', size_read) - return size_read + si_prefix = [[40, 'T'], + [30, 'G'], + [20, 'M'], + [10, 'K'], + [ 0, '' ]] + + for si_row in si_prefix: + if (size_bytes >= 2 ** si_row[0]): + break + + return f'{(size_bytes / 2 ** si_row[0]):.1f} {si_row[1]}B' From e4becead04664c2bd64379892dd17ed06626df54 Mon Sep 17 00:00:00 2001 From: Bridget Hass Date: Wed, 9 Apr 2025 13:22:48 -0600 Subject: [PATCH 6/9] remove reference to private testing repository --- tests/test_aop_download.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_aop_download.py b/tests/test_aop_download.py index 43d3752..5f62c26 100644 --- a/tests/test_aop_download.py +++ b/tests/test_aop_download.py @@ -13,8 +13,7 @@ These mainly test the functions' output messages for invalid inputs, collocated sites, data not found, and provisional availability scenarios -More complete integration tests are included in the nu_python_testing repository. -https://github.com/NEONScience/nu-python-testing +More complete integration tests are included in the NEONScience/nu_python_testing repository, which is currently private. 
Notes: - These tests require an internet connection to run, as they are actually making API calls From 6cae8f1a3acdf15828d0d192d5066e7b83f4bde0 Mon Sep 17 00:00:00 2001 From: Bridget Hass Date: Wed, 9 Apr 2025 13:53:42 -0600 Subject: [PATCH 7/9] add test script for testing the metadata helper functions, currently only tests convert_byte_size --- tests/test_metadata_helpers.py | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tests/test_metadata_helpers.py diff --git a/tests/test_metadata_helpers.py b/tests/test_metadata_helpers.py new file mode 100644 index 0000000..c674529 --- /dev/null +++ b/tests/test_metadata_helpers.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Apr 9 01:34:00 2025 + +@author: bhass + +Unit tests for metadata_helpers.py +""" + +import unittest +from src.neonutilities.helper_mods.metadata_helpers import convert_byte_size + +class TestConvertByteSize(unittest.TestCase): + def test_convert_byte_size(self): + """ + Test the convert_byte_size function with various byte sizes. + """ + # Test bytes + self.assertEqual(convert_byte_size(500), "500.0 B") + + # Test kilobytes + self.assertEqual(convert_byte_size(1024), "1.0 KB") + self.assertEqual(convert_byte_size(1536), "1.5 KB") + + # Test megabytes + self.assertEqual(convert_byte_size(1048576), "1.0 MB") + self.assertEqual(convert_byte_size(1572864), "1.5 MB") + + # Test gigabytes + self.assertEqual(convert_byte_size(1073741824), "1.0 GB") + self.assertEqual(convert_byte_size(1610612736), "1.5 GB") + + # Test terabytes + self.assertEqual(convert_byte_size(1099511627776), "1.0 TB") + self.assertEqual(convert_byte_size(1649267441664), "1.5 TB") + +if __name__ == "__main__": + unittest.main() \ No newline at end of file From 968c9afc61fcd2487dcd2bb6bd45f49df0a38b46 Mon Sep 17 00:00:00 2001 From: Bridget Hass Date: Wed, 9 Apr 2025 14:04:47 -0600 Subject: [PATCH 8/9] update file download sizes to reflect new 1024 conversion factor, which matches the data portal --- tests/test_aop_download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_aop_download.py b/tests/test_aop_download.py index 5f62c26..f4791cf 100644 --- a/tests/test_aop_download.py +++ b/tests/test_aop_download.py @@ -161,7 +161,7 @@ def test_check_download_size_message(self, input_mock): result = by_file_aop(dpid=self.dpid, site=self.site, year=self.year) # Check that the function asked for confirmation to download and prints expected message. input_mock.assert_called_once_with( - 'Continuing will download 128 files totaling approximately 97.7 MB. Do you want to proceed? (y/n) ') + 'Continuing will download 128 files totaling approximately 93.1 MB. Do you want to proceed? (y/n) ') # Check that the function halted the download self.assertEqual(result, None) @@ -313,7 +313,7 @@ def test_check_download_size_message(self, input_mock): year=self.year, easting=self.easting, northing=self.northing) # Check that the function asked for confirmation to download and prints expected message. input_mock.assert_called_once_with( - 'Continuing will download 7 files totaling approximately 4.0 MB. Do you want to proceed? (y/n) ') + 'Continuing will download 7 files totaling approximately 3.9 MB. Do you want to proceed? 
(y/n) ') # Check that the function halted the download self.assertEqual(result, None) From a9a751c0fac768ef78a91df6b60af6b0ec225a28 Mon Sep 17 00:00:00 2001 From: Bridget Hass Date: Wed, 9 Apr 2025 17:11:00 -0600 Subject: [PATCH 9/9] re-format using black, add .flake8 config file flake8 config file set to ignore long lines, line break before/after binary operators, and import warnings in init files --- .flake8 | 5 + examples/nu_example.py | 51 +- src/neonutilities/__init__.py | 7 +- src/neonutilities/aop_download.py | 560 ++++++----- src/neonutilities/citation.py | 24 +- src/neonutilities/get_issue_log.py | 60 +- src/neonutilities/helper_mods/api_helpers.py | 346 ++++--- .../helper_mods/metadata_helpers.py | 17 +- src/neonutilities/read_table_neon.py | 91 +- src/neonutilities/tabular_download.py | 369 +++++--- src/neonutilities/unzip_and_stack.py | 874 +++++++++++------- tests/test_aop_download.py | 415 ++++++--- tests/test_get_citation.py | 4 +- tests/test_metadata_helpers.py | 12 +- tests/test_zips_by_product.py | 70 +- 15 files changed, 1798 insertions(+), 1107 deletions(-) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..a39fd8a --- /dev/null +++ b/.flake8 @@ -0,0 +1,5 @@ +[flake8] +max-line-length = 88 +ignore = E203, E501, W503, W504 +per-file-ignores = + __init__.py: F401 \ No newline at end of file diff --git a/examples/nu_example.py b/examples/nu_example.py index 5782c53..7bad619 100644 --- a/examples/nu_example.py +++ b/examples/nu_example.py @@ -2,26 +2,45 @@ import os # download some veg structure data -veg = nu.load_by_product(dpid='DP1.10098.001', site=['WREF','RMNP'], - startdate='2022-01', enddate='2023-12', - include_provisional=True, check_size=False, - token=os.environ.get('NEON_TOKEN')) +veg = nu.load_by_product( + dpid="DP1.10098.001", + site=["WREF", "RMNP"], + startdate="2022-01", + enddate="2023-12", + include_provisional=True, + check_size=False, + token=os.environ.get("NEON_TOKEN"), +) # see what data tables were returned veg.keys() # download 30-minute PAR data -par = nu.load_by_product(dpid='DP1.00024.001', site='RMNP', - startdate='2023-06', enddate='2023-07', - timeindex=30, package='expanded', - include_provisional=True, check_size=False, - token=os.environ.get('NEON_TOKEN')) +par = nu.load_by_product( + dpid="DP1.00024.001", + site="RMNP", + startdate="2023-06", + enddate="2023-07", + timeindex=30, + package="expanded", + include_provisional=True, + check_size=False, + token=os.environ.get("NEON_TOKEN"), +) # download CHM tiles covering the veg structure plots at WREF -pppy = veg['vst_perplotperyear'] -east = pppy['easting'].to_list() -north = pppy['northing'].to_list() +pppy = veg["vst_perplotperyear"] +east = pppy["easting"].to_list() +north = pppy["northing"].to_list() -nu.by_tile_aop(dpid='DP3.30015.001', site='WREF', year=2023, - easting=east, northing=north, buffer=20, - include_provisional=True, check_size=False, - savepath='INSERT FILE PATH', token=os.environ.get('NEON_TOKEN')) +nu.by_tile_aop( + dpid="DP3.30015.001", + site="WREF", + year=2023, + easting=east, + northing=north, + buffer=20, + include_provisional=True, + check_size=False, + savepath="INSERT FILE PATH", + token=os.environ.get("NEON_TOKEN"), +) diff --git a/src/neonutilities/__init__.py b/src/neonutilities/__init__.py index 8544df7..089f5fc 100644 --- a/src/neonutilities/__init__.py +++ b/src/neonutilities/__init__.py @@ -1,5 +1,10 @@ from .citation import get_citation -from .aop_download import by_file_aop, by_tile_aop, 
list_available_dates, get_aop_tile_extents +from .aop_download import ( + by_file_aop, + by_tile_aop, + list_available_dates, + get_aop_tile_extents, +) from .tabular_download import zips_by_product from .get_issue_log import get_issue_log from .read_table_neon import read_table_neon diff --git a/src/neonutilities/aop_download.py b/src/neonutilities/aop_download.py index e35194c..051885c 100644 --- a/src/neonutilities/aop_download.py +++ b/src/neonutilities/aop_download.py @@ -20,13 +20,10 @@ from time import sleep import os import re -import platform import pandas as pd import numpy as np import logging from tqdm import tqdm -import requests -import importlib import importlib_resources from . import __resources__ from .helper_mods.api_helpers import get_api @@ -36,7 +33,7 @@ from .citation import get_citation # display the log info messages, only showing the message (otherwise it would print INFO:root:'message') -logging.basicConfig(level=logging.INFO, format='%(message)s') +logging.basicConfig(level=logging.INFO, format="%(message)s") # check that token was used @@ -59,9 +56,12 @@ def check_token(response): """ - if 'x-ratelimit-limit' in response.headers and response.headers['x-ratelimit-limit'] == '200': - logging.info( - 'API token was not recognized. Public rate limit applied.\n') + if ( + "x-ratelimit-limit" in response.headers + and response.headers["x-ratelimit-limit"] == "200" + ): + logging.info("API token was not recognized. Public rate limit applied.\n") + # %% @@ -111,25 +111,26 @@ def get_file_urls(urls, token=None): response = get_api(api_url=url, token=token) if response is None: logging.info( - "Data file retrieval failed. Check NEON data portal for outage alerts.") + "Data file retrieval failed. Check NEON data portal for outage alerts." + ) # get release info - release = response.json()['data']['release'] + release = response.json()["data"]["release"] releases.append(release) - file_url_dict = response.json()['data']['files'] + file_url_dict = response.json()["data"]["files"] file_url_df = pd.DataFrame(data=file_url_dict) - file_url_df['release'] = release + file_url_df["release"] = release # drop md5 and crc32 columns, which are all NaNs - file_url_df.drop(columns=['md5', 'crc32'], inplace=True) + file_url_df.drop(columns=["md5", "crc32"], inplace=True) # append the new dataframe to the existing one - all_file_url_df = pd.concat( - [all_file_url_df, file_url_df], ignore_index=True) + all_file_url_df = pd.concat([all_file_url_df, file_url_df], ignore_index=True) return all_file_url_df, list(set(releases)) + # %% @@ -168,46 +169,50 @@ def get_shared_flights(site): 'shared_flights.csv' located in the '__resources__' directory. """ - shared_flights_file = (importlib_resources.files( - __resources__) / 'shared_flights.csv') + shared_flights_file = ( + importlib_resources.files(__resources__) / "shared_flights.csv" + ) shared_flights_df = pd.read_csv(shared_flights_file) - shared_flights_dict = shared_flights_df.set_index( - ['site'])['flightSite'].to_dict() + shared_flights_dict = shared_flights_df.set_index(["site"])["flightSite"].to_dict() if site in shared_flights_dict: flightSite = shared_flights_dict[site] - if site in ['TREE', 'CHEQ', 'KONA', 'DCFS']: + if site in ["TREE", "CHEQ", "KONA", "DCFS"]: logging.info( - f'{site} is part of the flight box for {flightSite}. Downloading data from {flightSite}.') + f"{site} is part of the flight box for {flightSite}. Downloading data from {flightSite}." 
+ ) else: logging.info( - f'{site} is an aquatic site and is sometimes included in the flight box for {flightSite}. Aquatic sites are not always included in the flight coverage every year.\nDownloading data from {flightSite}. Check data to confirm coverage of {site}.') + f"{site} is an aquatic site and is sometimes included in the flight box for {flightSite}. Aquatic sites are not always included in the flight coverage every year.\nDownloading data from {flightSite}. Check data to confirm coverage of {site}." + ) site = flightSite return site def get_neon_sites(): - """This function gets a list of the valid NEON sites from the - neon_field_site_metadata.csv file for validation, and adds the AOP CHEQ + """This function gets a list of the valid NEON sites from the + neon_field_site_metadata.csv file for validation, and adds the AOP CHEQ site, which is an AOP site name and is associated with STEI & TREE." """ - neon_sites_file = (importlib_resources.files( - __resources__) / "neon_field_site_metadata_20250214.csv") + neon_sites_file = ( + importlib_resources.files(__resources__) + / "neon_field_site_metadata_20250214.csv" + ) neon_sites_df = pd.read_csv(neon_sites_file) - neon_sites_list = list(neon_sites_df['field_site_id']) - neon_sites_list.append('CHEQ') + neon_sites_list = list(neon_sites_df["field_site_id"]) + neon_sites_list.append("CHEQ") return neon_sites_list def get_data_product_name(dpid): - dpid_api_response = get_api( - f'https://data.neonscience.org/api/v0/products/{dpid}') - product_name = dpid_api_response.json()['data']['productName'] + dpid_api_response = get_api(f"https://data.neonscience.org/api/v0/products/{dpid}") + product_name = dpid_api_response.json()["data"]["productName"] return product_name + # %% functions to validate inputs for by_file_aop and by_tile_aop @@ -215,7 +220,9 @@ def validate_dpid(dpid): dpid_pattern = "DP[1-4]{1}.[0-9]{5}.00[1-2]{1}" if not re.fullmatch(dpid_pattern, dpid): raise ValueError( - f'{dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#') + f"{dpid} is not a properly formatted data product ID. 
The correct format is DP#.#####.00#" + ) + # TODO: change to read a list of valid AOP DPIDs # def validate_aop_dpid(dpid): @@ -226,31 +233,62 @@ def validate_dpid(dpid): # List of valid AOP data product IDs (2024 +, includes the .002 spectrometer revisions) -valid_aop_dpids = ['DP1.30001.001', # L1 waveform lidar - # L1 & L3 discrete lidar - 'DP1.30003.001', 'DP3.30015.001', 'DP3.30024.001', 'DP3.30025.001', - # L1 & L3 camera - 'DP1.30010.001', 'DP3.30010.001', - # L1 spectrometer (.001 & .002) - 'DP1.30006.001', 'DP1.30006.002', 'DP1.30008.001', - # L2 spectrometer, .001 - 'DP2.30011.001', 'DP2.30012.001', 'DP2.30014.001', 'DP2.30019.001', 'DP2.30026.001', - # L2 spectrometer, .002 - 'DP2.30011.002', 'DP2.30012.002', 'DP2.30014.002', 'DP2.30019.002', 'DP2.30026.002', - # L3 spectrometer, .001 - 'DP3.30006.001', 'DP3.30011.001', 'DP3.30012.001', 'DP3.30014.001', 'DP3.30019.001', 'DP3.30026.001', - # L3 spectrometer, .002 - 'DP3.30006.002', 'DP3.30011.002', 'DP3.30012.002', 'DP3.30014.002', 'DP3.30019.002', 'DP3.30026.002'] +valid_aop_dpids = [ + "DP1.30001.001", # L1 waveform lidar + # L1 & L3 discrete lidar + "DP1.30003.001", + "DP3.30015.001", + "DP3.30024.001", + "DP3.30025.001", + # L1 & L3 camera + "DP1.30010.001", + "DP3.30010.001", + # L1 spectrometer (.001 & .002) + "DP1.30006.001", + "DP1.30006.002", + "DP1.30008.001", + # L2 spectrometer, .001 + "DP2.30011.001", + "DP2.30012.001", + "DP2.30014.001", + "DP2.30019.001", + "DP2.30026.001", + # L2 spectrometer, .002 + "DP2.30011.002", + "DP2.30012.002", + "DP2.30014.002", + "DP2.30019.002", + "DP2.30026.002", + # L3 spectrometer, .001 + "DP3.30006.001", + "DP3.30011.001", + "DP3.30012.001", + "DP3.30014.001", + "DP3.30019.001", + "DP3.30026.001", + # L3 spectrometer, .002 + "DP3.30006.002", + "DP3.30011.002", + "DP3.30012.002", + "DP3.30014.002", + "DP3.30019.002", + "DP3.30026.002", +] # List of valid Level 3 AOP data product IDs -valid_aop_l3_dpids = [ - dpid for dpid in valid_aop_dpids if dpid.startswith('DP3')] +valid_aop_l3_dpids = [dpid for dpid in valid_aop_dpids if dpid.startswith("DP3")] # List of suspended AOP data product IDs (will need to change once these data products become active again) -suspended_aop_dpids = ['DP2.30018.001', 'DP3.30018.001', # canopy nitrogen - 'DP2.30020.001', 'DP3.30020.001', # canopy xanthophyll cycle - 'DP2.30022.001', 'DP3.30022.001', # canopy lignin - 'DP2.30016.001', 'DP3.30016.001'] # total biomass map +suspended_aop_dpids = [ + "DP2.30018.001", + "DP3.30018.001", # canopy nitrogen + "DP2.30020.001", + "DP3.30020.001", # canopy xanthophyll cycle + "DP2.30022.001", + "DP3.30022.001", # canopy lignin + "DP2.30016.001", + "DP3.30016.001", +] # total biomass map # request with suspended data (no data available) # eg. https://data.neonscience.org/api/v0/products/DP3.30016.001 @@ -276,19 +314,22 @@ def validate_aop_dpid(dpid): # Check if the dpid matches the pattern if not re.fullmatch(aop_dpid_pattern, dpid): raise ValueError( - f'{dpid} is not a valid AOP data product ID. AOP data products follow the format DP#.300##.00#.') + f"{dpid} is not a valid AOP data product ID. AOP data products follow the format DP#.300##.00#." 
+ ) # Check if the dpid is in the list of suspended AOP dpids if dpid in suspended_aop_dpids: raise ValueError( - f'{dpid} has been suspended and is not currently available, see https://data.neonscience.org/data-products/{dpid} for more details.') # ' Valid AOP IDs are: {", ".join(valid_aop_dpids)}.') + f"{dpid} has been suspended and is not currently available, see https://data.neonscience.org/data-products/{dpid} for more details." + ) # ' Valid AOP IDs are: {", ".join(valid_aop_dpids)}.') # Check if the dpid is in the list of valid AOP dpids if dpid not in valid_aop_dpids: valid_aop_dpids.sort() valid_aop_dpids_string = "\n".join(valid_aop_dpids) raise ValueError( - f'{dpid} is not a valid AOP data product ID. Valid AOP IDs are listed below:\n{valid_aop_dpids_string}') + f"{dpid} is not a valid AOP data product ID. Valid AOP IDs are listed below:\n{valid_aop_dpids_string}" + ) def validate_aop_l3_dpid(dpid): @@ -302,9 +343,10 @@ def validate_aop_l3_dpid(dpid): - ValueError: If the dpid does not start with DP3 or is not in the list of valid Level 3 AOP data product IDs. """ # Check if the dpid starts with DP3 - if not dpid.startswith('DP3'): + if not dpid.startswith("DP3"): raise ValueError( - f'{dpid} is not a valid Level 3 (L3) AOP data product ID. Level 3 AOP products follow the format DP3.300##.00#') + f"{dpid} is not a valid Level 3 (L3) AOP data product ID. Level 3 AOP products follow the format DP3.300##.00#" + ) # Check if the dpid is in the list of valid AOP dpids if dpid not in valid_aop_l3_dpids: @@ -316,22 +358,25 @@ def validate_aop_l3_dpid(dpid): # f'{key}: {value}' for key, value in dpid_dict.items()) raise ValueError( - f'{dpid} is not a valid Level 3 (L3) AOP data product ID. Valid L3 AOP IDs are listed below:\n{valid_aop_l3_dpids_string}') + f"{dpid} is not a valid Level 3 (L3) AOP data product ID. Valid L3 AOP IDs are listed below:\n{valid_aop_l3_dpids_string}" + ) # below prints out the corresponding data product names for each ID. # f'{dpid} is not a valid Level 3 (L3) AOP data product ID. Valid L3 AOP products are listed below.\n{formatted_dpid_dict}') def check_field_spectra_dpid(dpid): - if dpid == 'DP1.30012.001': + if dpid == "DP1.30012.001": raise ValueError( - f'{dpid} is the Field spectral data product, which is published as tabular data. Use zipsByProduct() or loadByProduct() to download these data.') + f"{dpid} is the Field spectral data product, which is published as tabular data. Use zipsByProduct() or loadByProduct() to download these data." + ) def validate_site_format(site): site_pattern = "[A-Z]{4}" if not re.fullmatch(site_pattern, site): raise ValueError( - f'{site} is an invalid site format. A four-letter NEON site code is required. NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites') + f"{site} is an invalid site format. A four-letter NEON site code is required. NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites" + ) def validate_neon_site(site): @@ -339,7 +384,8 @@ def validate_neon_site(site): if site not in neon_sites: raise ValueError( - f'{site} is not a valid NEON site code. A complete list of NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites') + f"{site} is not a valid NEON site code. 
A complete list of NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites" + ) def validate_year(year): @@ -347,20 +393,21 @@ def validate_year(year): year_pattern = "20[1-9][0-9]" if not re.fullmatch(year_pattern, year): raise ValueError( - f'{year} is an invalid year. Year is required in the format "2017" or 2017, eg. AOP data are available from 2013 to present.') + f'{year} is an invalid year. Year is required in the format "2017" or 2017, eg. AOP data are available from 2013 to present.' + ) def check_aop_dpid(response_dict, dpid): - if response_dict['data']['productScienceTeamAbbr'] != 'AOP': - logging.info( - f'{dpid} is not a remote sensing product. Use zipsByProduct()') + if response_dict["data"]["productScienceTeamAbbr"] != "AOP": + logging.info(f"{dpid} is not a remote sensing product. Use zipsByProduct()") return def get_site_year_urls(response_dict, site, year): site_info = next( - item for item in response_dict['data']['siteCodes'] if item["siteCode"] == site) - site_urls = site_info['availableDataUrls'] + item for item in response_dict["data"]["siteCodes"] if item["siteCode"] == site + ) + site_urls = site_info["availableDataUrls"] site_year_urls = [url for url in site_urls if str(year) in url] return site_year_urls @@ -370,31 +417,26 @@ def get_site_year_urls(response_dict, site, year): def list_available_dates(dpid, site): """ - list_available_dates displays the available releases and dates for a given product and site - -------- - Inputs: - dpid: the data product code (eg. 'DP3.30015.001' - CHM) - site: the 4-digit NEON site code (eg. 'JORN') - -------- - Returns: - prints the Release Tag (or PROVISIONAL) and the corresponding available dates (YYYY-MM) for each tag - - Examples: + list_available_dates displays the available releases and dates for a given product and site + -------- + Inputs: + dpid: the data product code (eg. 'DP3.30015.001' - CHM) + site: the 4-digit NEON site code (eg. 'JORN') + -------- + Returns: + prints the Release Tag (or PROVISIONAL) and the corresponding available dates (YYYY-MM) for each tag -------- - >>> list_available_dates('DP3.30015.001','JORN') - RELEASE-2025 Available Dates: 2017-08, 2018-08, 2019-08, 2021-08, 2022-09 - - >>> list_available_dates('DP3.30015.001','HOPB') - PROVISIONAL Available Dates: 2024-09 - RELEASE-2025 Available Dates: 2016-08, 2017-08, 2019-08, 2022-08 - - >>> list_available_dates('DP1.10098.001','HOPB') - ValueError: There are no data available for the data product DP1.10098.001 at the site HOPB. + Usage: + -------- + >>> list_available_dates('DP3.30015.001','JORN') + RELEASE-2025 Available Dates: 2017-08, 2018-08, 2019-08, 2021-08, 2022-09 - Created on Feb 17 2025 - - @author: Bridget Hass + >>> list_available_dates('DP3.30015.001','HOPB') + PROVISIONAL Available Dates: 2024-09 + RELEASE-2025 Available Dates: 2016-08, 2017-08, 2019-08, 2022-08 + >>> list_available_dates('DP1.10098.001','HOPB') + ValueError: There are no data available for the data product DP1.10098.001 at the site HOPB. """ product_url = "http://data.neonscience.org/api/v0/products/" + dpid response = get_api(api_url=product_url) # add input for token? 
@@ -409,23 +451,25 @@ def list_available_dates(dpid, site): # raise value error and print message if site is not a valid NEON site validate_neon_site(site) -# get available releases & months: - for i in range(len(response.json()['data']['siteCodes'])): - if site in response.json()['data']['siteCodes'][i]['siteCode']: - available_releases = response.json( - )['data']['siteCodes'][i]['availableReleases'] + # get available releases & months: + for i in range(len(response.json()["data"]["siteCodes"])): + if site in response.json()["data"]["siteCodes"][i]["siteCode"]: + available_releases = response.json()["data"]["siteCodes"][i][ + "availableReleases" + ] -# display available release tags (including provisional) and dates for each tag + # display available release tags (including provisional) and dates for each tag try: for entry in available_releases: - release = entry['release'] - available_months = ', '.join(entry['availableMonths']) + release = entry["release"] + available_months = ", ".join(entry["availableMonths"]) logging.info(f"{release} Available Dates: {available_months}") - except UnboundLocalError as e: + except UnboundLocalError: # if the available_releases variable doesn't exist, this error will show up: # UnboundLocalError: local variable 'available_releases' referenced before assignment raise ValueError( - f'There are no data available for the data product {dpid} at the site {site}.') + f"There are no data available for the data product {dpid} at the site {site}." + ) def get_tile_bounds(file_url_df, all_bounds=False): @@ -457,7 +501,7 @@ def get_tile_bounds(file_url_df, all_bounds=False): """ # Regular expression to match UTM coordinates in the format 'xxxxxx_yyyyyyy' - utm_pattern = re.compile(r'(\d{6})_(\d{7})') + utm_pattern = re.compile(r"(\d{6})_(\d{7})") # lists to store x and y coordinates x_coords = [] @@ -466,11 +510,12 @@ def get_tile_bounds(file_url_df, all_bounds=False): # filter out rows where 'name' ends with '.tif' , '.h5' or '.laz' # this will exclude shapefiles, just in case they don't match - data_df = file_url_df[file_url_df['name'].str.endswith( - ('.tif', '.h5', '.laz', '.zip'))] + data_df = file_url_df[ + file_url_df["name"].str.endswith((".tif", ".h5", ".laz", ".zip")) + ] # Iterate over each name in the DataFrame - for name in data_df['name']: + for name in data_df["name"]: match = utm_pattern.search(name) if match: x, y = match.groups() @@ -493,10 +538,7 @@ def get_tile_bounds(file_url_df, all_bounds=False): return x_bounds, y_bounds, sorted_coords -def get_aop_tile_extents(dpid, - site, - year, - token=None): +def get_aop_tile_extents(dpid, site, year, token=None): """ This function displays the tile extents for a given product, site, and year and returns a complete list of the UTM coordinates @@ -533,10 +575,6 @@ def get_aop_tile_extents(dpid, # This returns a list of the UTM x,y extent for all CHM tiles at the site MCRA collected in 2021. # It also displays the minimum and maximum UTM Easting and Northing (x and y) values for this product - site -year. 
- Created on Feb 17 2025 - - @author: Bridget Hass - """ # raise value error and print message if dpid isn't formatted as expected @@ -560,20 +598,19 @@ def get_aop_tile_extents(dpid, validate_year(year) # if token is an empty string, set to None - if token == '': + if token == "": token = None # query the products endpoint for the product requested - response = get_api( - "http://data.neonscience.org/api/v0/products/" + dpid, token) + response = get_api("http://data.neonscience.org/api/v0/products/" + dpid, token) # exit function if response is None (eg. if no internet connection) if response is None: - logging.info('No response from NEON API. Check internet connection') + logging.info("No response from NEON API. Check internet connection") return # check that token was used - if token and 'x-ratelimit-limit' in response.headers: + if token and "x-ratelimit-limit" in response.headers: check_token(response) # if response.headers['x-ratelimit-limit'] == '200': # print('API token was not recognized. Public rate limit applied.\n') @@ -593,7 +630,8 @@ def get_aop_tile_extents(dpid, # error message if nothing is available if len(site_year_urls) == 0: logging.info( - f"There are no {dpid} data available at the site {site} in {year}. \nTo display available dates for a given data product and site, use the function list_available_dates().") + f"There are no {dpid} data available at the site {site} in {year}. \nTo display available dates for a given data product and site, use the function list_available_dates()." + ) return # get file url dataframe for the available month urls @@ -608,8 +646,8 @@ def get_aop_tile_extents(dpid, # corner_tiles = get_corner_tiles(file_url_df) x_bounds, y_bounds, sorted_coords = get_tile_bounds(file_url_df) - logging.info(f'Easting Bounds: {x_bounds}') - logging.info(f'Northing Bounds: {y_bounds}') + logging.info(f"Easting Bounds: {x_bounds}") + logging.info(f"Northing Bounds: {y_bounds}") # return the sorted_coords list return sorted_coords @@ -618,14 +656,16 @@ def get_aop_tile_extents(dpid, # %% -def by_file_aop(dpid, - site, - year, - include_provisional=False, - check_size=True, - savepath=None, - chunk_size=1024, - token=None): +def by_file_aop( + dpid, + site, + year, + include_provisional=False, + check_size=True, + savepath=None, + chunk_size=1024, + token=None, +): """ This function queries the NEON API for AOP data by site, year, and product, and downloads all files found, preserving the original folder structure. It downloads files serially to @@ -676,18 +716,6 @@ def by_file_aop(dpid, -------- The function creates a folder in the 'savepath' directory, containing all AOP files meeting the query criteria. If 'savepath' is not provided, data are downloaded to the working directory. - - Windows Path Length Limitations: - When using this function to download files from the NEON data portal, you may encounter path length limitations - on Windows systems. Windows has a default maximum path length of 260 characters, which can cause download - functions to fail if this limit is exceeded. If the file path exceeds 260 characters on a Windows system, - the package will issue a warning.If you see this warning and no files are downloaded, either change your - working or savepath directory to be closer to the root directory, or enable long paths on Windows. You can - choose to ignore or filter these warnings using Python's warnings module if you prefer not to see them. 
- - Created on Feb 28 2024 - - @author: Bridget Hass """ # raise value error and print message if dpid isn't formatted as expected @@ -711,20 +739,19 @@ def by_file_aop(dpid, validate_year(year) # if token is an empty string, set to None - if token == '': + if token == "": token = None # query the products endpoint for the product requested - response = get_api( - "http://data.neonscience.org/api/v0/products/" + dpid, token) + response = get_api("http://data.neonscience.org/api/v0/products/" + dpid, token) # exit function if response is None (eg. if no internet connection) if response is None: - logging.info('No response from NEON API. Check internet connection') + logging.info("No response from NEON API. Check internet connection") return # check that token was used - if token and 'x-ratelimit-limit' in response.headers: + if token and "x-ratelimit-limit" in response.headers: check_token(response) # if response.headers['x-ratelimit-limit'] == '200': # print('API token was not recognized. Public rate limit applied.\n') @@ -744,7 +771,8 @@ def by_file_aop(dpid, # error message if nothing is available if len(site_year_urls) == 0: logging.info( - f"There are no {dpid} data available at the site {site} in {year}.\nTo display available dates for a given data product and site, use the function list_available_dates().") + f"There are no {dpid} data available at the site {site} in {year}.\nTo display available dates for a given data product and site, use the function list_available_dates()." + ) # print("There are no data available at the selected site and year.") return @@ -761,31 +789,39 @@ def by_file_aop(dpid, if include_provisional: # log provisional included message logging.info( - "Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.") + "Provisional data are included. To exclude provisional data, use input parameter include_provisional=False." + ) else: # log provisional not included message and filter to the released data # logging.info( # "Provisional data are not included. To download provisional data, use input parameter include_provisional=True.") - file_url_df = file_url_df[file_url_df['release'] != 'PROVISIONAL'] + file_url_df = file_url_df[file_url_df["release"] != "PROVISIONAL"] if len(file_url_df) == 0: logging.info( - "Provisional data are not included. To download provisional data, use input parameter include_provisional=True.") + "Provisional data are not included. To download provisional data, use input parameter include_provisional=True." + ) num_files = len(file_url_df) if num_files == 0: logging.info( - "No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True.") + "No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True." + ) return # get the total size of all the files found - download_size_bytes = file_url_df['size'].sum() + download_size_bytes = file_url_df["size"].sum() # print(f'download size, bytes: {download_size_bytes}') download_size = convert_byte_size(download_size_bytes) # print(f'download size: {download_size}') # report data download size and ask user if they want to proceed if check_size: - if input(f"Continuing will download {num_files} files totaling approximately {download_size}. Do you want to proceed? (y/n) ") != "y": + if ( + input( + f"Continuing will download {num_files} files totaling approximately {download_size}. Do you want to proceed? 
(y/n) " + ) + != "y" + ): print("Download halted.") return @@ -797,13 +833,13 @@ def by_file_aop(dpid, os.makedirs(download_path, exist_ok=True) # serially download all files, with progress bar - files = list(file_url_df['url']) - print( - f"Downloading {num_files} files totaling approximately {download_size}\n") + files = list(file_url_df["url"]) + print(f"Downloading {num_files} files totaling approximately {download_size}\n") sleep(1) for file in tqdm(files): - download_file(url=file, savepath=download_path, - chunk_size=chunk_size, token=token) + download_file( + url=file, savepath=download_path, chunk_size=chunk_size, token=token + ) # download issue log table ilog = get_issue_log(dpid=dpid, token=None) @@ -814,8 +850,11 @@ def by_file_aop(dpid, if "PROVISIONAL" in releases: try: cit = get_citation(dpid=dpid, release="PROVISIONAL") - with open(f"{download_path}/citation_{dpid}_PROVISIONAL.txt", - mode="w+", encoding="utf-8") as f: + with open( + f"{download_path}/citation_{dpid}_PROVISIONAL.txt", + mode="w+", + encoding="utf-8", + ) as f: f.write(cit) except Exception: pass @@ -827,29 +866,35 @@ def by_file_aop(dpid, if len(rel) == 1: try: cit = get_citation(dpid=dpid, release=rel[0]) - with open(f"{download_path}/citation_{dpid}_{rel[0]}.txt", - mode="w+", encoding="utf-8") as f: + with open( + f"{download_path}/citation_{dpid}_{rel[0]}.txt", + mode="w+", + encoding="utf-8", + ) as f: f.write(cit) except Exception: pass return + # %% -def by_tile_aop(dpid, - site, - year, - easting, - northing, - buffer=0, - include_provisional=False, - check_size=True, - savepath=None, - chunk_size=1024, - token=None, - verbose=False): +def by_tile_aop( + dpid, + site, + year, + easting, + northing, + buffer=0, + include_provisional=False, + check_size=True, + savepath=None, + chunk_size=1024, + token=None, + verbose=False, +): """ This function queries the NEON API for AOP data by site, year, product, and UTM coordinates, and downloads all files found, preserving the original @@ -912,23 +957,6 @@ def by_tile_aop(dpid, # This will download any tiles overlapping the specified UTM coordinates for # 2021 canopy height model data from McRae Creek to the './test_download' directory. - Notes - -------- - The function creates a folder in the 'savepath' directory, containing all AOP files meeting the query criteria. - If 'savepath' is not provided, data are downloaded to the working directory. - - Windows Path Length Limitations: - When using this function to download files from the NEON data portal, you may encounter path length limitations - on Windows systems. Windows has a default maximum path length of 260 characters, which can cause download - functions to fail if this limit is exceeded. If the file path exceeds 260 characters on a Windows system, - the package will issue a warning.If you see this warning and no files are downloaded, either change your - working or savepath directory to be closer to the root directory, or enable long paths on Windows. You can - choose to ignore or filter these warnings using Python's warnings module if you prefer not to see them. - - Created on Feb 28 2024 - - @author: Bridget Hass - """ # raise value error and print message if dpid isn't formatted as expected @@ -962,14 +990,16 @@ def by_tile_aop(dpid, easting = [float(e) for e in easting] except ValueError as e: logging.info( - 'The easting is invalid, this is required as a number or numeric list format, eg. 
732000 or [732000, 733000]') + "The easting is invalid, this is required as a number or numeric list format, eg. 732000 or [732000, 733000]" + ) print(e) try: northing = [float(e) for e in northing] except ValueError as e: logging.info( - 'The northing is invalid, this is required as a number or numeric list format, eg. 4713000 or [4713000, 4714000]') + "The northing is invalid, this is required as a number or numeric list format, eg. 4713000 or [4713000, 4714000]" + ) print(e) # link easting and northing coordinates - as a list of tuples ? @@ -982,32 +1012,31 @@ def by_tile_aop(dpid, if len(easting) != len(northing): logging.info( - 'Easting and northing list lengths do not match, and/or contain null values. Cannot identify paired coordinates.') + "Easting and northing list lengths do not match, and/or contain null values. Cannot identify paired coordinates." + ) return # if token is an empty string, set to None - if token == '': + if token == "": token = None # query the products endpoint for the product requested - response = get_api( - "http://data.neonscience.org/api/v0/products/" + dpid, token) + response = get_api("http://data.neonscience.org/api/v0/products/" + dpid, token) # exit function if response is None (eg. if no internet connection) if response is None: - logging.info('No response from NEON API. Check internet connection') + logging.info("No response from NEON API. Check internet connection") return -# # check that token was used - if token and 'x-ratelimit-limit' in response.headers: + # # check that token was used + if token and "x-ratelimit-limit" in response.headers: check_token(response) # get the request response dictionary response_dict = response.json() # error message if dpid is not an AOP data product - if response_dict['data']['productScienceTeamAbbr'] != 'AOP': - print( - f'{dpid} is not a remote sensing product. Use zipsByProduct()') + if response_dict["data"]["productScienceTeamAbbr"] != "AOP": + print(f"{dpid} is not a remote sensing product. Use zipsByProduct()") return # replace collocated site with the site name it's published under @@ -1019,7 +1048,8 @@ def by_tile_aop(dpid, # error message if nothing is available if len(site_year_urls) == 0: logging.info( - f"There are no {dpid} data available at the site {site} in {year}.\nTo display available dates for a given data product and site, use the function list_available_dates().") + f"There are no {dpid} data available at the site {site} in {year}.\nTo display available dates for a given data product and site, use the function list_available_dates()." + ) return # get file url dataframe for the available month url(s) @@ -1034,31 +1064,36 @@ def by_tile_aop(dpid, if include_provisional: # print provisional included message logging.info( - "Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.") + "Provisional data are included. To exclude provisional data, use input parameter include_provisional=False." + ) else: # print provisional not included message - file_url_df = file_url_df[file_url_df['release'] != 'PROVISIONAL'] + file_url_df = file_url_df[file_url_df["release"] != "PROVISIONAL"] logging.info( - "Provisional data are not included. To download provisional data, use input parameter include_provisional=True.") + "Provisional data are not included. To download provisional data, use input parameter include_provisional=True." 
+ ) # get the number of files in the dataframe after filtering for provisional data, if there are no files to download, return num_files = len(file_url_df) if num_files == 0: logging.info( - "No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True.") + "No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True." + ) return # BLAN edge-case - contains plots in 18N and plots in 17N; flight data are all in 17N # convert easting & northing coordinates for Blandy (BLAN) to 17N - if site == 'BLAN' and any([e <= 250000.0 for e in easting]): + if site == "BLAN" and any([e <= 250000.0 for e in easting]): # check that pyproj is installed try: from pyproj import Proj, CRS + # importlib.import_module('pyproj') except ImportError: logging.info( - "Package pyproj is required for this function to work at the BLAN site. Install and re-try") + "Package pyproj is required for this function to work at the BLAN site. Install and re-try" + ) return crs17 = CRS.from_epsg(32617) # utm zone 17N @@ -1067,16 +1102,17 @@ def by_tile_aop(dpid, proj18to17 = Proj.from_crs(crs_from=crs18, crs_to=crs17) # link easting and northing coordinates so it's easier to parse the zone for each - coord_tuples = [(easting[i], northing[i]) - for i in range(0, len(easting))] + coord_tuples = [(easting[i], northing[i]) for i in range(0, len(easting))] coords17 = [(e, n) for (e, n) in coord_tuples if e > 250000.0] coords18 = [(e, n) for (e, n) in coord_tuples if e <= 250000.0] # apply the projection transformation from 18N to 17N for each coordinate tuple - coords18_reprojected = [proj18to17.transform( - coords18[i][0], coords18[i][1]) for i in range(len(coords18))] + coords18_reprojected = [ + proj18to17.transform(coords18[i][0], coords18[i][1]) + for i in range(len(coords18)) + ] coords17.extend(coords18_reprojected) @@ -1084,16 +1120,18 @@ def by_tile_aop(dpid, easting = [c[0] for c in coords17] northing = [c[1] for c in coords17] - logging.info('Blandy (BLAN) plots include two UTM zones, flight data ' - 'are all in 17N. Coordinates in UTM zone 18N have been ' - 'converted to 17N to download the correct tiles. You ' - 'will need to make the same conversion to connect ' - 'airborne to ground data.') + logging.info( + "Blandy (BLAN) plots include two UTM zones, flight data " + "are all in 17N. Coordinates in UTM zone 18N have been " + "converted to 17N to download the correct tiles. You " + "will need to make the same conversion to connect " + "airborne to ground data." 
+ ) # function to round down to the nearest 1000, in order to determine # lower left coordinate of AOP tile to be downloaded def round_down1000(val): - return int(np.floor(val/1000)*1000) + return int(np.floor(val / 1000) * 1000) # function to get the coordinates of the tiles including the buffer def get_buffer_coords(easting, northing, buffer): @@ -1103,8 +1141,12 @@ def get_buffer_coords(easting, northing, buffer): buffer_max_e = easting + buffer buffer_max_n = northing + buffer - new_coords = [(buffer_min_e, buffer_min_n), (buffer_min_e, buffer_max_n), - (buffer_max_e, buffer_min_n), (buffer_max_e, buffer_max_n)] + new_coords = [ + (buffer_min_e, buffer_min_n), + (buffer_min_e, buffer_max_n), + (buffer_max_e, buffer_min_n), + (buffer_max_e, buffer_max_n), + ] return new_coords @@ -1116,52 +1158,55 @@ def get_buffer_coords(easting, northing, buffer): buffer_coords.extend(get_buffer_coords(e, n, buffer)) buffer_coords_rounded = [ - (round_down1000(c[0]), round_down1000(c[1])) for c in buffer_coords] + (round_down1000(c[0]), round_down1000(c[1])) for c in buffer_coords + ] # remove duplicate coordinates buffer_coords_set = list(set(buffer_coords_rounded)) buffer_coords_set.sort() - utm17_eastings_str = ', '.join([str(round(e, 2)) for e in easting]) - utm17_northings_str = ', '.join(str(round(n, 2)) for n in northing) + utm17_eastings_str = ", ".join([str(round(e, 2)) for e in easting]) + utm17_northings_str = ", ".join(str(round(n, 2)) for n in northing) if verbose: - logging.info(f'UTM 17N Easting(s): {utm17_eastings_str}') - logging.info(f'UTM 17N Northing(s): {utm17_northings_str}') + logging.info(f"UTM 17N Easting(s): {utm17_eastings_str}") + logging.info(f"UTM 17N Northing(s): {utm17_northings_str}") # logging.info('Buffer:', buffer) - logging.info( - 'UTM (x, y) lower-left coordinates of tiles to be downloaded:') + logging.info("UTM (x, y) lower-left coordinates of tiles to be downloaded:") for coord in buffer_coords_set: logging.info(coord) # create the list of utm "easting_northing" strings that will be used to match to the tile names - coord_strs = ['_'.join([str(c[0]), str(c[1])]) - for c in buffer_coords_set] + coord_strs = ["_".join([str(c[0]), str(c[1])]) for c in buffer_coords_set] # append the .txt file to include the README - IS THIS NEEDED? 
- coord_strs.append('.txt') + coord_strs.append(".txt") # subset the dataframe to include only the coordinate strings matching coord_strs # if verbose: # print('finding the tiles') - file_url_df_subset = file_url_df[file_url_df['name'].str.contains( - '|'.join(coord_strs))] + file_url_df_subset = file_url_df[ + file_url_df["name"].str.contains("|".join(coord_strs)) + ] - file_url_df_subset2 = file_url_df_subset[~file_url_df_subset['name'].str.endswith( - '.txt')] + file_url_df_subset2 = file_url_df_subset[ + ~file_url_df_subset["name"].str.endswith(".txt") + ] # if any coordinates were not included in the data, print a warning message # Warning: the following coordinates are outside the bounds of site-year: unique_coords_to_download = set( - file_url_df_subset2['name'].str.extract(r'_(\d+_\d+)_')[0]) + file_url_df_subset2["name"].str.extract(r"_(\d+_\d+)_")[0] + ) - coord_strs.remove('.txt') + coord_strs.remove(".txt") # compare two lists: - coords_not_found = list( - set(coord_strs).difference(list(unique_coords_to_download))) + coords_not_found = list(set(coord_strs).difference(list(unique_coords_to_download))) if len(coords_not_found) > 0: - print('Warning, the following coordinates fall outside the bounds of the site, so will not be downloaded:') + print( + "Warning, the following coordinates fall outside the bounds of the site, so will not be downloaded:" + ) for coord in coords_not_found: - print(','.join(coord.split('_'))) + print(",".join(coord.split("_"))) # get the number of files in the dataframe, if there are no files to download, return num_files = len(file_url_df_subset) @@ -1170,13 +1215,18 @@ def get_buffer_coords(easting, northing, buffer): return # get the total size of all the files found - download_size_bytes = file_url_df_subset['size'].sum() + download_size_bytes = file_url_df_subset["size"].sum() download_size = convert_byte_size(download_size_bytes) # ask whether to continue download, depending on size if check_size: - if input(f"Continuing will download {num_files} files totaling approximately {download_size}. Do you want to proceed? (y/n) ") != "y": + if ( + input( + f"Continuing will download {num_files} files totaling approximately {download_size}. Do you want to proceed? 
(y/n) " + ) + != "y" + ): print("Download halted") return @@ -1189,13 +1239,13 @@ def get_buffer_coords(easting, northing, buffer): os.makedirs(download_path, exist_ok=True) # serially download all files, with progress bar - files = list(file_url_df_subset['url']) - print( - f"Downloading {num_files} files totaling approximately {download_size}\n") + files = list(file_url_df_subset["url"]) + print(f"Downloading {num_files} files totaling approximately {download_size}\n") sleep(1) for file in tqdm(files): - download_file(url=file, savepath=download_path, - chunk_size=chunk_size, token=token) + download_file( + url=file, savepath=download_path, chunk_size=chunk_size, token=token + ) # download issue log table ilog = get_issue_log(dpid=dpid, token=None) @@ -1206,8 +1256,11 @@ def get_buffer_coords(easting, northing, buffer): if "PROVISIONAL" in releases: try: cit = get_citation(dpid=dpid, release="PROVISIONAL") - with open(f"{download_path}/citation_{dpid}_PROVISIONAL.txt", - mode="w+", encoding="utf-8") as f: + with open( + f"{download_path}/citation_{dpid}_PROVISIONAL.txt", + mode="w+", + encoding="utf-8", + ) as f: f.write(cit) except Exception: pass @@ -1219,8 +1272,11 @@ def get_buffer_coords(easting, northing, buffer): if len(rel) == 1: try: cit = get_citation(dpid=dpid, release=rel[0]) - with open(f"{download_path}/citation_{dpid}_{rel[0]}.txt", - mode="w+", encoding="utf-8") as f: + with open( + f"{download_path}/citation_{dpid}_{rel[0]}.txt", + mode="w+", + encoding="utf-8", + ) as f: f.write(cit) except Exception: pass diff --git a/src/neonutilities/citation.py b/src/neonutilities/citation.py index 71e9cf7..080aeaa 100644 --- a/src/neonutilities/citation.py +++ b/src/neonutilities/citation.py @@ -6,15 +6,15 @@ def get_citation(dpid, release): """ - Use the DOI Foundation API to get BibTex-formatted citations for NEON data, - or use a template to generate a BibTex citation for provisional data. + Use the DOI Foundation API to get BibTex-formatted citations for NEON data, + or use a template to generate a BibTex citation for provisional data. Helper function to download and stacking functions. Parameters ---------- dpid: str The data product ID of the data to be cited - + release: str The data release to be cited. Can be provisional. 
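For released data, get_citation() resolves the product DOI and then relies on DOI content negotiation, requesting the DOI URL with an application/x-bibtex Accept header to retrieve the BibTeX entry (the reformatted request appears below, in the release branch of the function). The same pattern works on its own; a minimal sketch, with an illustrative function name:

    import requests

    def bibtex_from_doi(doi_url, timeout=30):
        """Fetch a BibTeX citation for a DOI URL via content negotiation."""
        resp = requests.get(
            doi_url, headers={"accept": "application/x-bibtex"}, timeout=timeout
        )
        resp.raise_for_status()
        return resp.text

    # e.g. pass the URL stored under relinfo["productDoi"]["url"] in get_citation()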
@@ -24,7 +24,7 @@ def get_citation(dpid, release): Example ------- - Get the citation for Breeding landbird point counts (DP1.10003.001), + Get the citation for Breeding landbird point counts (DP1.10003.001), RELEASE-2023 >>> cit = get_citation(dpid="DP1.10003.001", release="RELEASE-2023") @@ -35,14 +35,12 @@ def get_citation(dpid, release): """ if release == "PROVISIONAL": - # construct citation from template citI = "@misc{DPID/provisional,\n doi = {},\n url = {https://data.neonscience.org/data-products/DPID},\n author = {{National Ecological Observatory Network (NEON)}},\n language = {en},\n title = {NAME (DPID)},\n publisher = {National Ecological Observatory Network (NEON)},\n year = {YEAR}\n}" citDP = citI.replace("DPID", dpid) citY = citDP.replace("YEAR", str(datetime.now().year)) - nm_req = requests.get("https://data.neonscience.org/api/v0/products/" + - dpid) + nm_req = requests.get("https://data.neonscience.org/api/v0/products/" + dpid) nm_str = nm_req.json() nm = nm_str["data"]["productName"] @@ -50,21 +48,17 @@ def get_citation(dpid, release): return cit else: - # get DOI from NEON API, then citation from DOI API - pr_req = requests.get("https://data.neonscience.org/api/v0/products/" + - dpid) + pr_req = requests.get("https://data.neonscience.org/api/v0/products/" + dpid) pr_str = pr_req.json() rels = pr_str["data"]["releases"] relinfo = next((i for i in rels if i["release"] == release), None) - + if relinfo is None: - print("There are no data with dpid=" + dpid + - " and release=" + release) + print("There are no data with dpid=" + dpid + " and release=" + release) return relinfo else: doi = relinfo["productDoi"]["url"] - doi_req = requests.get(doi, - headers={"accept": "application/x-bibtex"}) + doi_req = requests.get(doi, headers={"accept": "application/x-bibtex"}) return doi_req.text diff --git a/src/neonutilities/get_issue_log.py b/src/neonutilities/get_issue_log.py index f84af11..9940b81 100644 --- a/src/neonutilities/get_issue_log.py +++ b/src/neonutilities/get_issue_log.py @@ -18,7 +18,7 @@ import pandas as pd from .helper_mods.api_helpers import get_api -logging.basicConfig(level=logging.INFO, format='%(message)s') +logging.basicConfig(level=logging.INFO, format="%(message)s") # %% functions to validate inputs (should pull these out into another helper module??) @@ -36,7 +36,9 @@ def validate_dpid(dpid): dpid_pattern = "DP[1-4]{1}.[0-9]{5}.00[1-2]{1}" if not re.fullmatch(dpid_pattern, dpid): raise ValueError( - f'{dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#') + f"{dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#" + ) + # %% functions to get the change / issue logs @@ -51,17 +53,18 @@ def get_change_log_df(dpid, token=None): Returns: change_log_df: A DataFrame containing the changeLogs for the provided dpid. - columns of the dataframe are: 'id', 'parentIssueID', 'issueDate', - 'resolvedDate', 'dateRangeStart', 'dateRangeEnd', 'locationAffected', + columns of the dataframe are: 'id', 'parentIssueID', 'issueDate', + 'resolvedDate', 'dateRangeStart', 'dateRangeEnd', 'locationAffected', 'issue', 'resolution' """ req = get_api( - api_url=f"https://data.neonscience.org/api/v0/products/{dpid}", token=token) + api_url=f"https://data.neonscience.org/api/v0/products/{dpid}", token=token + ) if req is None: logging.info(f"Error in metadata retrieval for {dpid}. 
Issue log not found.") return None - all_product_info = pd.json_normalize(req.json()['data']) - change_log_df = pd.DataFrame(all_product_info['changeLogs'][0]) + all_product_info = pd.json_normalize(req.json()["data"]) + change_log_df = pd.DataFrame(all_product_info["changeLogs"][0]) return change_log_df @@ -80,20 +83,37 @@ def get_eddy_issue_log(dpid, token=None): 'dateRangeEnd', 'locationAffected', 'issue', 'resolution' """ - bundle_dps = ["DP1.00007.001", "DP1.00010.001", "DP1.00034.001", "DP1.00035.001", - "DP1.00036.001", "DP1.00037.001", "DP1.00099.001", "DP1.00100.001", - "DP2.00008.001", "DP2.00009.001", "DP2.00024.001", "DP3.00008.001", - "DP3.00009.001", "DP3.00010.001", "DP4.00002.001", "DP4.00007.001", - "DP4.00067.001", "DP4.00137.001", "DP4.00201.001", "DP4.00200.001"] + bundle_dps = [ + "DP1.00007.001", + "DP1.00010.001", + "DP1.00034.001", + "DP1.00035.001", + "DP1.00036.001", + "DP1.00037.001", + "DP1.00099.001", + "DP1.00100.001", + "DP2.00008.001", + "DP2.00009.001", + "DP2.00024.001", + "DP3.00008.001", + "DP3.00009.001", + "DP3.00010.001", + "DP4.00002.001", + "DP4.00007.001", + "DP4.00067.001", + "DP4.00137.001", + "DP4.00201.001", + "DP4.00200.001", + ] eddy_issue_log_list = [] for dpid in bundle_dps: change_log_df = get_change_log_df(dpid, token=token) if change_log_df is not None and not change_log_df.empty: - change_log_df['dpid'] = dpid + change_log_df["dpid"] = dpid eddy_issue_log_list.append(change_log_df) - + eddy_issue_log_df = pd.concat(eddy_issue_log_list, ignore_index=True) return eddy_issue_log_df @@ -106,19 +126,19 @@ def get_issue_log(dpid, token=None): Args: dpid: str The NEON data product ID. - + token: str - User-specific API token from data.neonscience.org user account. See - https://data.neonscience.org/data-api/rate-limiting/ for details about + User-specific API token from data.neonscience.org user account. See + https://data.neonscience.org/data-api/rate-limiting/ for details about API rate limits and user tokens. If omitted, download uses the public rate limit. Returns: issue_log_df: A pandas DataFrame containing the changeLogs for the provided dpid. - columns of the bundled eddy data frame are: 'dpid', 'id', + columns of the bundled eddy data frame are: 'dpid', 'id', 'parentIssueID', 'issueDate', 'resolvedDate', 'dateRangeStart', - 'dateRangeEnd', 'locationAffected', 'issue', 'resolution'; + 'dateRangeEnd', 'locationAffected', 'issue', 'resolution'; all other data products have the same columns minus 'dpid' - + Example ------- Get the issue log for Breeding landbird point counts (DP1.10003.001) diff --git a/src/neonutilities/helper_mods/api_helpers.py b/src/neonutilities/helper_mods/api_helpers.py index 3302b23..0c7cc2e 100644 --- a/src/neonutilities/helper_mods/api_helpers.py +++ b/src/neonutilities/helper_mods/api_helpers.py @@ -12,18 +12,18 @@ import pandas as pd from tqdm import tqdm from .metadata_helpers import get_recent -logging.basicConfig(level=logging.INFO, format='%(message)s') + +logging.basicConfig(level=logging.INFO, format="%(message)s") # Set global user agent -vers = importlib.metadata.version('neonutilities') +vers = importlib.metadata.version("neonutilities") plat = platform.python_version() osplat = platform.platform() usera = f"neonutilities/{vers} Python/{plat} {osplat}" -def get_api(api_url, - token=None): +def get_api(api_url, token=None): """ Accesses the API with options to use the user-specific API token generated within neon.datascience user accounts. 
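get_api() wraps requests.get with NEON's token, user-agent, and retry handling (reformatted in the next hunk), and the query and download functions in this patch route their API calls through it. A sketch of typical usage, mirroring get_change_log_df() above; the product ID is the landbird product used in the docstring examples:

    from neonutilities.helper_mods.api_helpers import get_api

    req = get_api(
        api_url="https://data.neonscience.org/api/v0/products/DP1.10003.001"
        # pass token="..." here to use a personal API token instead of the public rate limit
    )
    if req is not None:  # get_api returns None when the request ultimately fails
        product_info = req.json()["data"]
        print(product_info["productName"])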
@@ -47,18 +47,21 @@ def get_api(api_url, @author: Zachary Nickerson """ + def get_status_code_meaning(status_code): return requests.status_codes._codes[status_code][0] # Check internet connection try: - check_connection = requests.get("https://data.neonscience.org/", - headers={"User-Agent": usera}) + check_connection = requests.get( + "https://data.neonscience.org/", headers={"User-Agent": usera} + ) if check_connection.status_code != 200: status_code = check_connection.status_code status_code_meaning = get_status_code_meaning(status_code) raise ConnectionError( - f"Request failed with status code {status_code}, indicating '{status_code_meaning}'\n") + f"Request failed with status code {status_code}, indicating '{status_code_meaning}'\n" + ) except Exception: # ConnectionError as e raise ConnectionError("Connection error. Cannot access NEON API.\n") @@ -66,34 +69,36 @@ def get_status_code_meaning(status_code): # burst reset time to try again. j = 1 - while (j <= 5): - + while j <= 5: # Try making the request try: # Construct URL either with or without token if token is None: - response = requests.get(api_url, - headers={"accept": "application/json", - "User-Agent": usera}) + response = requests.get( + api_url, headers={"accept": "application/json", "User-Agent": usera} + ) else: response = requests.get( - api_url, headers={"X-API-TOKEN": token, - "accept": "application/json", - "User-Agent": usera}) + api_url, + headers={ + "X-API-TOKEN": token, + "accept": "application/json", + "User-Agent": usera, + }, + ) # Check for successful response if response.status_code == 200: - - if 'x-ratelimit-limit' in response.headers: + if "x-ratelimit-limit" in response.headers: # Retry get request if rate limit is reached - limit_remain = response.headers.get( - 'x-ratelimit-remaining') + limit_remain = response.headers.get("x-ratelimit-remaining") if int(limit_remain) < 1: # Wait for the reset time - time_reset = response.headers.get('x-ratelimit-reset') + time_reset = response.headers.get("x-ratelimit-reset") logging.info( - f"Rate limit reached. Pausing for {time_reset} seconds to reset.\n") + f"Rate limit reached. Pausing for {time_reset} seconds to reset.\n" + ) time.sleep(int(time_reset)) # Increment loop to retry request attempt j += 1 @@ -110,10 +115,10 @@ def get_status_code_meaning(status_code): else: # Return nothing if request failed (status code is not 200) # Print the status code and it's meaning - status_code_meaning = get_status_code_meaning( - response.status_code) + status_code_meaning = get_status_code_meaning(response.status_code) raise ConnectionError( - f"Request failed with status code {response.status_code}, indicating '{status_code_meaning}'\n") + f"Request failed with status code {response.status_code}, indicating '{status_code_meaning}'\n" + ) return response @@ -122,8 +127,7 @@ def get_status_code_meaning(status_code): return None -def get_api_headers(api_url, - token=None): +def get_api_headers(api_url, token=None): """ Accesses the API with options to use the user-specific API token generated within neon.datascience user accounts. 
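get_api_headers() in the next hunk repeats the burst-limit handling used in get_api() above: when x-ratelimit-remaining drops below 1, sleep for the advertised x-ratelimit-reset interval and retry, up to five attempts. Reduced to its core, the pattern is roughly the following sketch (not the packaged implementation):

    import time
    import requests

    def get_with_burst_limit(url, headers=None, max_attempts=5):
        """GET a URL, pausing and retrying when the burst rate limit is exhausted."""
        for _ in range(max_attempts):
            response = requests.get(url, headers=headers, timeout=30)
            remaining = response.headers.get("x-ratelimit-remaining")
            if response.status_code == 200 and remaining is not None and int(remaining) < 1:
                # Burst limit hit: wait out the reset window the server reports, then retry
                time.sleep(int(response.headers.get("x-ratelimit-reset", 1)))
                continue
            return response
        return response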
@@ -143,53 +147,60 @@ def get_api_headers(api_url, @author: Zachary Nickerson @author: Claire Lunch """ + def get_status_code_meaning(status_code): return requests.status_codes._codes[status_code][0] # Check internet connection try: - check_connection = requests.head("https://data.neonscience.org/", - headers={"User-Agent": usera}) + check_connection = requests.head( + "https://data.neonscience.org/", headers={"User-Agent": usera} + ) if check_connection.status_code != 200: status_code = check_connection.status_code status_code_meaning = get_status_code_meaning(status_code) raise ConnectionError( - f"Request failed with status code {status_code}, indicating '{status_code_meaning}'\n") + f"Request failed with status code {status_code}, indicating '{status_code_meaning}'\n" + ) except Exception: # ConnectionError as e - raise ConnectionError("No internet connection detected. Cannot access NEON API.\n") + raise ConnectionError( + "No internet connection detected. Cannot access NEON API.\n" + ) # Make 5 request attempts. If the rate limit is reached, pause for the # burst reset time to try again. j = 1 - while (j <= 5): - + while j <= 5: # Try making the request try: # Construct URL either with or without token if token is None: - response = requests.head(api_url, - headers={"accept": "application/json", - "User-Agent": usera}) + response = requests.head( + api_url, headers={"accept": "application/json", "User-Agent": usera} + ) else: response = requests.head( - api_url, headers={"X-API-TOKEN": token, - "accept": "application/json", - "User-Agent": usera}) + api_url, + headers={ + "X-API-TOKEN": token, + "accept": "application/json", + "User-Agent": usera, + }, + ) # Check for successful response if response.status_code == 200: - - if 'x-ratelimit-limit' in response.headers: + if "x-ratelimit-limit" in response.headers: # Retry get request if rate limit is reached - limit_remain = response.headers.get( - 'x-ratelimit-remaining') + limit_remain = response.headers.get("x-ratelimit-remaining") if int(limit_remain) < 1: # Wait for the reset time - time_reset = response.headers.get('x-ratelimit-reset') + time_reset = response.headers.get("x-ratelimit-reset") logging.info( - f"Rate limit reached. Pausing for {time_reset} seconds to reset.\n") + f"Rate limit reached. Pausing for {time_reset} seconds to reset.\n" + ) time.sleep(int(time_reset)) # Increment loop to retry request attempt j += 1 @@ -206,24 +217,22 @@ def get_status_code_meaning(status_code): else: # Return nothing if request failed (status code is not 200) # Print the status code and it's meaning - status_code_meaning = get_status_code_meaning( - response.status_code) + status_code_meaning = get_status_code_meaning(response.status_code) raise ConnectionError( - f"Request failed with status code {response.status_code}, indicating '{status_code_meaning}'\n") + f"Request failed with status code {response.status_code}, indicating '{status_code_meaning}'\n" + ) return response except Exception: raise ConnectionError( - "No response. NEON API may be unavailable, check NEON data portal for outage alerts. If the problem persists and can't be traced to an outage alert, check your computer for firewall or other security settings preventing Python from accessing the internet.") + "No response. NEON API may be unavailable, check NEON data portal for outage alerts. If the problem persists and can't be traced to an outage alert, check your computer for firewall or other security settings preventing Python from accessing the internet." 
+ ) -def get_zip_urls(url_set, - package, - release, - include_provisional, - token=None, - progress=True): +def get_zip_urls( + url_set, package, release, include_provisional, token=None, progress=True +): """ Given a set of urls to the data endpoint of the NEON API, returns the set of zip file urls for each site-month package. Internal function, called by zips_by_product(). @@ -255,11 +264,12 @@ def get_zip_urls(url_set, logging.info("Finding available files") for i in tqdm(range(0, len(url_set)), disable=not progress): - # get list of files from data endpoint m_res = get_api(api_url=url_set[i], token=token) if m_res is None: - logging.info("Connection error for a subset of urls. Check outputs for missing data.") + logging.info( + "Connection error for a subset of urls. Check outputs for missing data." + ) return None m_di = m_res.json() @@ -271,29 +281,34 @@ def get_zip_urls(url_set, # if include_provisional=F, exclude provisional if not include_provisional: if m_di["data"]["release"] == "PROVISIONAL": - provflag=True + provflag = True continue # check for no files if "packages" not in list(m_di["data"]): - logging.info(f"No files found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}") + logging.info( + f"No files found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}" + ) continue if len(m_di["data"]["packages"]) == 0: - logging.info(f"No files found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}") + logging.info( + f"No files found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}" + ) continue # if package=expanded, check for expanded. reassign to basic if not found. if package == "expanded": if package not in [p["type"] for p in m_di["data"]["packages"]]: - logging.info(f"No expanded package found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}. Basic package downloaded instead.") + logging.info( + f"No expanded package found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}. Basic package downloaded instead." + ) package = "basic" # get zip file url and file name - zi = [u["url"] for u in m_di["data"]["packages"] if u["type"]==package] + zi = [u["url"] for u in m_di["data"]["packages"] if u["type"] == package] h = get_api_headers(api_url=zi[0], token=token) - fltp = re.sub(pattern='"', repl="", - string=h.headers["content-disposition"]) + fltp = re.sub(pattern='"', repl="", string=h.headers["content-disposition"]) flnmi = re.sub(pattern="inline; filename=", repl="", string=fltp) # get file sizes @@ -311,20 +326,24 @@ def get_zip_urls(url_set, zpfiles = dict(flnm=flnm, z=z, sz=sz, rel=rel) # provisional message - if(provflag): - logging.info("Provisional data were excluded from available files list. To download provisional data, use input parameter include_provisional=True.") - - return(zpfiles) - - -def get_tab_urls(url_set, - package, - release, - include_provisional, - timeindex, - tabl, - token=None, - progress=True): + if provflag: + logging.info( + "Provisional data were excluded from available files list. To download provisional data, use input parameter include_provisional=True." + ) + + return zpfiles + + +def get_tab_urls( + url_set, + package, + release, + include_provisional, + timeindex, + tabl, + token=None, + progress=True, +): """ Given a set of urls to the data endpoint of the NEON API, and averaging interval or table name criteria, returns the set of urls to individual files for each site-month package. 
Internal function, called by zips_by_product(). @@ -365,7 +384,9 @@ def get_tab_urls(url_set, spr = re.compile("sensor_positions") if timeindex != "all": - tt = re.compile(str(timeindex) + "min|" + str(timeindex) + "_min|science_review_flags") + tt = re.compile( + str(timeindex) + "min|" + str(timeindex) + "_min|science_review_flags" + ) if tabl != "all": tb = re.compile("[.]" + tabl + "[.]") @@ -375,11 +396,12 @@ def get_tab_urls(url_set, logging.info("Finding available files") for i in tqdm(range(0, len(url_set)), disable=not progress): - # get list of files from data endpoint m_res = get_api(api_url=url_set[i], token=token) if m_res is None: - logging.info("Connection error for a subset of urls. Check outputs for missing data.") + logging.info( + "Connection error for a subset of urls. Check outputs for missing data." + ) return None m_di = m_res.json() @@ -391,7 +413,7 @@ def get_tab_urls(url_set, # if include_provisional=F, exclude provisional if not include_provisional: if m_di["data"]["release"] == "PROVISIONAL": - provflag=True + provflag = True continue # subset to package. switch to basic if expanded not available @@ -404,14 +426,15 @@ def get_tab_urls(url_set, # check for no files if len(flsp) == 0: - logging.info(f"No files found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}") + logging.info( + f"No files found for site {m_di['data']['siteCode']} and month {m_di['data']['month']}" + ) continue # get zip file url and file name zi = [u["url"] for u in m_di["data"]["packages"] if u["type"] == package] h = get_api_headers(api_url=zi[0], token=token) - fltp = re.sub(pattern='"', repl="", - string=h.headers["content-disposition"]) + fltp = re.sub(pattern='"', repl="", string=h.headers["content-disposition"]) flpthit = re.sub(pattern="inline; filename=", repl="", string=fltp) flpthi = re.sub(pattern=".zip", repl="/", string=flpthit) @@ -420,9 +443,9 @@ def get_tab_urls(url_set, rdmei = [f for f in m_di["data"]["files"] if rdr.search(f["name"])] spi = [f for f in m_di["data"]["files"] if spr.search(f["name"])] for f in varfi: - f["name"] = flpthi+f["name"] + f["name"] = flpthi + f["name"] for f in rdmei: - f["name"] = flpthi+f["name"] + f["name"] = flpthi + f["name"] varf.append(varfi) rdme.append(rdmei) @@ -433,24 +456,28 @@ def get_tab_urls(url_set, # subset by averaging interval if timeindex != "all": - flnmi = [flpthi+fl["name"] for fl in flsp if tt.search(fl["name"])] + flnmi = [flpthi + fl["name"] for fl in flsp if tt.search(fl["name"])] flszi = [fl["size"] for fl in flsp if tt.search(fl["name"])] zi = [fl["url"] for fl in flsp if tt.search(fl["name"])] # check for no files if len(flnmi) == 0: - logging.info(f"No files found for site {m_di['data']['siteCode']}, month {m_di['data']['month']}, and averaging interval (time index) {timeindex}") + logging.info( + f"No files found for site {m_di['data']['siteCode']}, month {m_di['data']['month']}, and averaging interval (time index) {timeindex}" + ) continue # subset by table if tabl != "all": - flnmi = [flpthi+fl["name"] for fl in flsp if tb.search(fl["name"])] + flnmi = [flpthi + fl["name"] for fl in flsp if tb.search(fl["name"])] flszi = [fl["size"] for fl in flsp if tb.search(fl["name"])] zi = [fl["url"] for fl in flsp if tb.search(fl["name"])] # check for no files if len(flnmi) == 0: - logging.info(f"No files found for site {m_di['data']['siteCode']}, month {m_di['data']['month']}, and table {tabl}") + logging.info( + f"No files found for site {m_di['data']['siteCode']}, month {m_di['data']['month']}, and 
table {tabl}" + ) continue # return url, file name, file size, and release @@ -466,7 +493,7 @@ def get_tab_urls(url_set, varfl = get_recent(varf, "variables") flnm.append([fl["name"] for fl in varfl]) z.append([fl["url"] for fl in varfl]) - sz.append([fl["size"]for fl in varfl]) + sz.append([fl["size"] for fl in varfl]) except Exception: pass @@ -501,16 +528,15 @@ def get_tab_urls(url_set, tbfiles = dict(flnm=flnm, flpth=flpth, z=z, sz=sz, rel=rel) # provisional message - if(provflag): - logging.info("Provisional data were excluded from available files list. To download provisional data, use input parameter include_provisional=True.") + if provflag: + logging.info( + "Provisional data were excluded from available files list. To download provisional data, use input parameter include_provisional=True." + ) - return(tbfiles) + return tbfiles -def download_urls(url_set, - outpath, - token=None, - progress=True): +def download_urls(url_set, outpath, token=None, progress=True): """ Given a set of urls to NEON data packages or files, downloads the contents of each. Internal function, called by zips_by_product(). @@ -535,10 +561,10 @@ def download_urls(url_set, logging.info("Downloading files") for i in tqdm(range(0, len(url_set["z"])), disable=not progress): - - if len(outpath+url_set["flnm"][i]) > 260 and platform.system() == "Windows": + if len(outpath + url_set["flnm"][i]) > 260 and platform.system() == "Windows": raise OSError( - f'Filepath is {len(outpath+url_set["flnm"][i])} characters long. Filepaths on Windows are limited to 260 characters. Move your working directory closer to the root directory or enable long path support in Windows through the Registry Editor.') + f'Filepath is {len(outpath+url_set["flnm"][i])} characters long. Filepaths on Windows are limited to 260 characters. Move your working directory closer to the root directory or enable long path support in Windows through the Registry Editor.' + ) return else: @@ -547,41 +573,54 @@ def download_urls(url_set, j = 0 while j < 3: try: - with open(outpath+url_set["flnm"][i], "wb") as out_file: - content = requests.get(url_set["z"][i], stream=True, - headers={"accept": "application/json", - "User-Agent": usera}, - timeout=(10, 120)).content + with open(outpath + url_set["flnm"][i], "wb") as out_file: + content = requests.get( + url_set["z"][i], + stream=True, + headers={ + "accept": "application/json", + "User-Agent": usera, + }, + timeout=(10, 120), + ).content out_file.write(content) - j = j+5 + j = j + 5 except Exception as e: logging.info( - f"File {url_set['flnm'][i]} could not be downloaded. Re-attempting.") + f"File {url_set['flnm'][i]} could not be downloaded. Re-attempting." + ) print(e) - j = j+1 + j = j + 1 time.sleep(5) else: j = 0 while j < 3: try: - with open(outpath+url_set["flnm"][i], "wb") as out_file: - content = requests.get(url_set["z"][i], stream=True, - headers={"X-API-TOKEN": token, - "accept": "application/json", - "User-Agent": usera}, - timeout=(10, 120)).content + with open(outpath + url_set["flnm"][i], "wb") as out_file: + content = requests.get( + url_set["z"][i], + stream=True, + headers={ + "X-API-TOKEN": token, + "accept": "application/json", + "User-Agent": usera, + }, + timeout=(10, 120), + ).content out_file.write(content) - j = j+5 + j = j + 5 except Exception as e: logging.info( - f"File {url_set['flnm'][i]} could not be downloaded. Re-attempting.") + f"File {url_set['flnm'][i]} could not be downloaded. Re-attempting." 
+ ) print(e) - j = j+1 + j = j + 1 time.sleep(5) except Exception: logging.info( - f"File {url_set['flnm'][i]} could not be downloaded and was skipped. If this issue persists, check your network connection and check the NEON Data Portal for outage alerts.") + f"File {url_set['flnm'][i]} could not be downloaded and was skipped. If this issue persists, check your network connection and check the NEON Data Portal for outage alerts." + ) pass return None @@ -599,7 +638,7 @@ def download_file(url, savepath, chunk_size=1024, token=None): savepath: str The file location (path) where the file will be downloaded. - chunk_size: + chunk_size: Size in bytes of chunks for chunked download token: str, optional @@ -620,10 +659,10 @@ def download_file(url, savepath, chunk_size=1024, token=None): Notes -------- - The function creates the directory specified by 'savepath' if it does not exist. + The function creates the directory specified by 'savepath' if it does not exist. It also downloads the readme.txt file which contains detailed information about the data package, issue logs, etc. https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP3.30015.001/SCBI/20230601T000000--20230701T000000/basic/NEON.D02.SCBI.DP3.30015.001.readme.20240206T001418Z.txt - + The function issues a warning on Windows systems if the full download file path exceeds 260 characters, as the file may not be downloaded due to path length limitations. @author: Bridget Hass @@ -631,14 +670,15 @@ def download_file(url, savepath, chunk_size=1024, token=None): """ pathparts = url.split("/") - file_path = "/".join(pathparts[3:len(pathparts)]) + file_path = "/".join(pathparts[3 : len(pathparts)]) file_fullpath = savepath + "/" + file_path file_fullpath_abs = os.path.abspath(file_fullpath) # get the absolute path if len(file_fullpath_abs) > 260 and platform.system() == "Windows": warnings.warn( - f'Filepaths on Windows are limited to 260 characters. Attempting to download a filepath that is {len(file_fullpath_abs)} characters long. Set the working or savepath directory to be closer to the root directory or enable long path support in Windows.') + f"Filepaths on Windows are limited to 260 characters. Attempting to download a filepath that is {len(file_fullpath_abs)} characters long. Set the working or savepath directory to be closer to the root directory or enable long path support in Windows." + ) # return else: @@ -649,40 +689,51 @@ def download_file(url, savepath, chunk_size=1024, token=None): j = 0 while j < 3: try: - r = requests.get(url, stream=True, - headers={"accept": "application/json", - "User-Agent": usera}, - timeout=(10, 120)) - j = j+5 + r = requests.get( + url, + stream=True, + headers={"accept": "application/json", "User-Agent": usera}, + timeout=(10, 120), + ) + j = j + 5 except Exception: logging.info( - f"File {os.path.basename(url)} could not be downloaded. Re-attempting.") - j = j+1 + f"File {os.path.basename(url)} could not be downloaded. Re-attempting." + ) + j = j + 1 time.sleep(5) else: j = 0 while j < 3: try: - r = requests.get(url, stream=True, - headers={"X-API-TOKEN": token, - "accept": "application/json", - "User-Agent": usera}, - timeout=(10, 120)) - j = j+5 + r = requests.get( + url, + stream=True, + headers={ + "X-API-TOKEN": token, + "accept": "application/json", + "User-Agent": usera, + }, + timeout=(10, 120), + ) + j = j + 5 except Exception: logging.info( - f"File {os.path.basename(url)} could not be downloaded. Re-attempting.") - j = j+1 + f"File {os.path.basename(url)} could not be downloaded. 
Re-attempting." + ) + j = j + 1 time.sleep(5) - with open(file_fullpath, 'wb') as f: + with open(file_fullpath, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): if chunk: f.write(chunk) r.close() - except Exception as e: - logging.info(f"File {os.path.basename(url)} could not be downloaded and was skipped or partially downloaded. If this issue persists, check your network connection and check the NEON Data Portal for outage alerts.") + except Exception: + logging.info( + f"File {os.path.basename(url)} could not be downloaded and was skipped or partially downloaded. If this issue persists, check your network connection and check the NEON Data Portal for outage alerts." + ) # print(e) pass @@ -704,14 +755,13 @@ def readme_url(readmepath): -------- DataFrame A pandas data frame of the readme content. + """ -""" - - rdres = requests.get(readmepath, - headers={"accept": "application/json", - "User-Agent": usera}) + rdres = requests.get( + readmepath, headers={"accept": "application/json", "User-Agent": usera} + ) rdtxt = rdres.text rdlst = rdtxt.split("\n") rdfrm = pd.DataFrame(rdlst) - return(rdfrm) + return rdfrm diff --git a/src/neonutilities/helper_mods/metadata_helpers.py b/src/neonutilities/helper_mods/metadata_helpers.py index 64a652c..7ab18c0 100644 --- a/src/neonutilities/helper_mods/metadata_helpers.py +++ b/src/neonutilities/helper_mods/metadata_helpers.py @@ -80,16 +80,11 @@ def convert_byte_size(size_bytes): '2.8 GB' >>> convert_byte_size(4000000000000) - '3.6 TB' -""" - si_prefix = [[40, 'T'], - [30, 'G'], - [20, 'M'], - [10, 'K'], - [ 0, '' ]] - + '3.6 TB'""" + si_prefix = [[40, "T"], [30, "G"], [20, "M"], [10, "K"], [0, ""]] + for si_row in si_prefix: - if (size_bytes >= 2 ** si_row[0]): + if size_bytes >= 2 ** si_row[0]: break - - return f'{(size_bytes / 2 ** si_row[0]):.1f} {si_row[1]}B' + + return f"{(size_bytes / 2 ** si_row[0]):.1f} {si_row[1]}B" diff --git a/src/neonutilities/read_table_neon.py b/src/neonutilities/read_table_neon.py index cdf5cc0..1147ed6 100644 --- a/src/neonutilities/read_table_neon.py +++ b/src/neonutilities/read_table_neon.py @@ -6,7 +6,8 @@ import pyarrow as pa from pyarrow import dataset import logging -logging.basicConfig(level=logging.INFO, format='%(message)s') + +logging.basicConfig(level=logging.INFO, format="%(message)s") def get_variables(v): @@ -40,7 +41,11 @@ def get_variables(v): if v.dataType[i] in ["string", "uri"]: typ = pa.string() if v.dataType[i] == "dateTime": - if v.pubFormat[i] in ["yyyy-MM-dd'T'HH:mm:ss'Z'(floor)", "yyyy-MM-dd'T'HH:mm:ss'Z'", "yyyy-MM-dd'T'HH:mm:ss'Z'(round)"]: + if v.pubFormat[i] in [ + "yyyy-MM-dd'T'HH:mm:ss'Z'(floor)", + "yyyy-MM-dd'T'HH:mm:ss'Z'", + "yyyy-MM-dd'T'HH:mm:ss'Z'(round)", + ]: typ = pa.timestamp("s", tz="UTC") else: if v.pubFormat[i] in ["yyyy-MM-dd(floor)", "yyyy-MM-dd"]: @@ -50,7 +55,7 @@ def get_variables(v): typ = pa.int64() else: typ = pa.string() - if i==0: + if i == 0: vschema = pa.schema([(nm, typ)]) else: nfield = pa.field(nm, typ) @@ -59,9 +64,7 @@ def get_variables(v): return vschema -def read_table_neon(data_file, - var_file - ): +def read_table_neon(data_file, var_file): """ Read a NEON data table with correct data types for each variable. @@ -70,7 +73,7 @@ def read_table_neon(data_file, ------------------- data_file: str Filepath to a data table to load. - + var_file: str Filepath to a variables file matching the data table. 
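The pyarrow schema built by get_variables() above is what read_table_neon() passes to pyarrow's dataset reader in the next hunk, so each CSV column is parsed with the type NEON declares rather than an inferred one. A minimal illustration of that handoff; the column names and file path are made up:

    import pyarrow as pa
    from pyarrow import dataset

    # Toy stand-in for the schema get_variables() derives from a variables file
    tableschema = pa.schema([
        ("siteID", pa.string()),
        ("collectDate", pa.timestamp("s", tz="UTC")),
        ("individualCount", pa.int64()),
    ])

    # Reading a CSV against an explicit schema, as read_table_neon() does:
    # dat = dataset.dataset(source="some_table.csv", format="csv", schema=tableschema)
    # pdat = dat.to_table().to_pandas()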
@@ -87,22 +90,28 @@ def read_table_neon(data_file, @author: Zachary Nickerson """ - + # Read in variables file and check type if isinstance(var_file, str): try: v = pd.read_csv(var_file) except Exception: - logging.info("Table read failed because var_file must be either a NEON variables table or a file path to a NEON variables table.") + logging.info( + "Table read failed because var_file must be either a NEON variables table or a file path to a NEON variables table." + ) return # Check this is a valid variables file - if any(x in ['category', 'system', 'stat'] for x in list(v.columns)): - print('var_file appears to match DP4.00200.001. Data wrangling for surface-atmosphere exchange data is currently only available in the R package version of neonUtilities.') + if any(x in ["category", "system", "stat"] for x in list(v.columns)): + print( + "var_file appears to match DP4.00200.001. Data wrangling for surface-atmosphere exchange data is currently only available in the R package version of neonUtilities." + ) return else: - if not any(x in ['table', 'fieldName', 'dataType'] for x in list(v.columns)): - logging.info('var_file is not a variables file, or is missing critical values.') + if not any(x in ["table", "fieldName", "dataType"] for x in list(v.columns)): + logging.info( + "var_file is not a variables file, or is missing critical values." + ) return # get field names from the data table without loading table @@ -118,15 +127,19 @@ def read_table_neon(data_file, # Check that most fields have a corrsponding value in variables m = len(tabcols) - len(tableschema) if m == len(tabcols): - logging.info("Cannot correct data types because the variables file does not match the data file.") + logging.info( + "Cannot correct data types because the variables file does not match the data file." + ) return if m > 4: - logging.info(f"{m} fieldNames are present in data file but not in variables file. Data load may be affected; if possible, unknown fields are read as character strings.") + logging.info( + f"{m} fieldNames are present in data file but not in variables file. Data load may be affected; if possible, unknown fields are read as character strings." + ) # read data and append file names dat = dataset.dataset(source=data_file, format="csv", schema=tableschema) dattab = dat.to_table() - pdat = dattab.to_pandas() + pdat = dattab.to_pandas() return pdat @@ -151,7 +164,7 @@ def date_convert(dates): Created on 2 May 2024 @author: Claire Lunch - """ + """ try: dout = pd.to_datetime(dates, format="ISO8601", utc=True) @@ -203,13 +216,11 @@ def get_variables_pandas(v): if v["dataType"][i] == "dateTime": typ = "datetime64[ns, UTC]" dtdict[nm] = typ - + return dtdict -def cast_table_neon(data_table, - var_table - ): +def cast_table_neon(data_table, var_table): """ Cast a NEON data table to the correct data types for each variable, if possible. @@ -232,32 +243,38 @@ def cast_table_neon(data_table, @author: Claire Lunch """ - + # Check inputs formatting if not isinstance(data_table, pd.DataFrame): logging.info("Data table input is not a pandas data frame.") return None - + if not isinstance(var_table, pd.DataFrame): logging.info("Variables table input is not a pandas data frame.") return None - + # Check this is a valid variables file - if any(x in ['category', 'system', 'stat'] for x in list(var_table.columns)): - logging.info('var_table appears to match DP4.00200.001. 
Data wrangling for surface-atmosphere exchange data is currently only available in the R package version of neonUtilities.') + if any(x in ["category", "system", "stat"] for x in list(var_table.columns)): + logging.info( + "var_table appears to match DP4.00200.001. Data wrangling for surface-atmosphere exchange data is currently only available in the R package version of neonUtilities." + ) return None else: - if not any(x in ['table', 'fieldName', 'dataType'] for x in list(var_table.columns)): - logging.info('var_table is not a variables file, or is missing critical values.') + if not any( + x in ["table", "fieldName", "dataType"] for x in list(var_table.columns) + ): + logging.info( + "var_table is not a variables file, or is missing critical values." + ) return None # get data types vdt = get_variables_pandas(var_table) - + # get field names from the data table tabcols = list(data_table.columns) cast_table = data_table - + # iterate over columns and try to cast each for i in tabcols: if i not in vdt.keys(): @@ -265,18 +282,22 @@ def cast_table_neon(data_table, else: if vdt[i] in ["Float64", "Int64"]: try: - dtemp = cast_table[i].replace(r'^\s*$', np.nan, regex=True) + dtemp = cast_table[i].replace(r"^\s*$", np.nan, regex=True) cast_table[i] = dtemp.astype(vdt[i]) except Exception: - logging.info(f"Field {i} could not be cast to type {vdt[i]}. Data read as string type.") + logging.info( + f"Field {i} could not be cast to type {vdt[i]}. Data read as string type." + ) cast_table[i] = data_table[i] continue - if vdt[i]=="datetime64[ns, UTC]" and not i=="publicationDate": + if vdt[i] == "datetime64[ns, UTC]" and not i == "publicationDate": try: cast_table[i] = date_convert(data_table[i]) except Exception: - logging.info(f"Field {i} could not be cast to type {vdt[i]}. Data read as string type.") + logging.info( + f"Field {i} could not be cast to type {vdt[i]}. Data read as string type." + ) cast_table[i] = data_table[i] continue - + return cast_table diff --git a/src/neonutilities/tabular_download.py b/src/neonutilities/tabular_download.py index 4f3faaa..d89e37e 100644 --- a/src/neonutilities/tabular_download.py +++ b/src/neonutilities/tabular_download.py @@ -12,14 +12,23 @@ from .helper_mods.api_helpers import download_urls from .helper_mods.metadata_helpers import convert_byte_size from . import __resources__ -logging.basicConfig(level=logging.INFO, format='%(message)s') - - -def query_files(lst, dpid, site="all", startdate=None, enddate=None, - package="basic", release="current", - timeindex="all", tabl="all", - include_provisional=False, token=None): +logging.basicConfig(level=logging.INFO, format="%(message)s") + + +def query_files( + lst, + dpid, + site="all", + startdate=None, + enddate=None, + package="basic", + release="current", + timeindex="all", + tabl="all", + include_provisional=False, + token=None, +): """ Use the query endpoint to get a list of data package files from NEON. @@ -52,7 +61,11 @@ def query_files(lst, dpid, site="all", startdate=None, enddate=None, # check expanded package status if package == "expanded": if not lst["data"]["productHasExpanded"]: - logging.info("No expanded package found for " + dpid + ". Basic package downloaded instead.") + logging.info( + "No expanded package found for " + + dpid + + ". Basic package downloaded instead." 
+ ) package = "basic" # if sites are not specified, get list of sites with data @@ -96,7 +109,16 @@ def query_files(lst, dpid, site="all", startdate=None, enddate=None, relurl = "&release=" + release # construct full query url and run query - qurl = "https://data.neonscience.org/api/v0/data/query?productCode=" + dpid + sitesurl + dateurl + ipurl + "&package=" + package + relurl + qurl = ( + "https://data.neonscience.org/api/v0/data/query?productCode=" + + dpid + + sitesurl + + dateurl + + ipurl + + "&package=" + + package + + relurl + ) qreq = get_api(api_url=qurl, token=token) if qreq is None: logging.info("No API response for selected query. Check inputs.") @@ -114,85 +136,113 @@ def query_files(lst, dpid, site="all", startdate=None, enddate=None, fdict = packdict[j].get("files") for k in range(0, len(fdict)): flurl.append(fdict[k].get("url")) - releasedict[re.sub(pattern="https://storage.googleapis.com/", - repl="", string=fdict[k].get("url"))] = rdict + releasedict[ + re.sub( + pattern="https://storage.googleapis.com/", + repl="", + string=fdict[k].get("url"), + ) + ] = rdict # if timeindex or tabl are set, subset the list of files if timeindex == "all" and tabl == "all": return [flurl, releasedict] else: if timeindex != "all" and tabl != "all": - raise ValueError("Only one of timeindex or tabl can be specified, not both.") + raise ValueError( + "Only one of timeindex or tabl can be specified, not both." + ) else: - if timeindex!="all": - tt = re.compile(str(timeindex) + "min|" + str(timeindex) + "_min|science_review_flags|variables|readme|sensor_positions|categoricalCodes") + if timeindex != "all": + tt = re.compile( + str(timeindex) + + "min|" + + str(timeindex) + + "_min|science_review_flags|variables|readme|sensor_positions|categoricalCodes" + ) if tabl != "all": - tt = re.compile("[.]" + tabl + "[.]|variables|readme|sensor_positions|categoricalCodes") + tt = re.compile( + "[.]" + + tabl + + "[.]|variables|readme|sensor_positions|categoricalCodes" + ) flurlsub = [f for f in flurl if tt.search(f)] releasedictsub = {r: v for r, v in releasedict.items() if tt.search(r)} return [flurlsub, releasedictsub] -def zips_by_product(dpid, site="all", startdate=None, enddate=None, - package="basic", release="current", - timeindex="all", tabl="all", check_size=True, - include_provisional=False, cloud_mode=False, - progress=True, token=None, savepath=None): + +def zips_by_product( + dpid, + site="all", + startdate=None, + enddate=None, + package="basic", + release="current", + timeindex="all", + tabl="all", + check_size=True, + include_provisional=False, + cloud_mode=False, + progress=True, + token=None, + savepath=None, +): """ - This function queries the NEON API for data by data product, site(s), and - month(s), and downloads the corresponding data packages. Use this function - to download NEON observational (OS) and instrument (IS) data; for remote + This function queries the NEON API for data by data product, site(s), and + month(s), and downloads the corresponding data packages. Use this function + to download NEON observational (OS) and instrument (IS) data; for remote sensing data, use the by_file_aop() and by_tile_aop() functions. Parameters ------------------ dpid: str Data product identifier in the form DP#.#####.### - + site: str Either the string 'all', or one or more 4-letter NEON site codes. Defaults to 'all'. 
- + startdate: str, optional Earliest date of data to download, in the form YYYY-MM - + enddate: str, optional Latest date of data to download, in the form YYYY-MM - + package: str, optional Download package to access, either basic or expanded. Defaults to 'basic'. - + release: str, optional Data release to download. Defaults to the most recent release. - + timeindex: str, optional Either 'all', or the time index of data to download, in minutes. Only applicable to sensor (IS) data. Defaults to 'all'. - + tabl: str, optional Either the string 'all', or the name of a single data table to download. Only applicable to observational (OS) data. Defaults to 'all'. - + check_size: bool, optional True or False, should the user approve the total file size before downloading? Defaults to True. When working in batch mode, or other non-interactive workflow, use check_size=False. - + include_provisional: bool, optional - Should Provisional data be returned in the download? Defaults to False. See - https://www.neonscience.org/data-samples/data-management/data-revisions-releases + Should Provisional data be returned in the download? Defaults to False. See + https://www.neonscience.org/data-samples/data-management/data-revisions-releases for details on the difference between provisional and released data. - + cloud_mode: bool, optional - Use cloud mode to transfer files cloud-to-cloud? If used, zips_by_product() returns a - list of files rather than downloading them. Defaults to False; in general this - option should be used via load_by_product(), in which zips_by_product() is a + Use cloud mode to transfer files cloud-to-cloud? If used, zips_by_product() returns a + list of files rather than downloading them. Defaults to False; in general this + option should be used via load_by_product(), in which zips_by_product() is a helper function. - + progress: bool, optional Should the function display progress bars as it runs? Defaults to True. - + token: str, optional - User-specific API token from data.neonscience.org user account. See - https://data.neonscience.org/data-api/rate-limiting/ for details about + User-specific API token from data.neonscience.org user account. See + https://data.neonscience.org/data-api/rate-limiting/ for details about API rate limits and user tokens. If omitted, download uses the public rate limit. - + savepath: str, optional File path of location to save data. @@ -215,54 +265,92 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, """ # error message if dpid is not formatted correctly - if not re.search(pattern="DP[1-4]{1}.[0-9]{5}.00[0-9]{1}", - string=dpid): - raise ValueError(f"{dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#") + if not re.search(pattern="DP[1-4]{1}.[0-9]{5}.00[0-9]{1}", string=dpid): + raise ValueError( + f"{dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#" + ) # error message if package is not basic or expanded - if not package in ["basic","expanded"]: - raise ValueError(f"{package} is not a valid package name. Package must be basic or expanded") + if not package in ["basic", "expanded"]: + raise ValueError( + f"{package} is not a valid package name. Package must be basic or expanded" + ) # error messages for products that can't be downloaded by zips_by_product() # AOP products if dpid[4:5:1] == 3 and dpid != "DP1.30012.001": - raise ValueError(f"{dpid} is a remote sensing data product. 
Use the by_file_aop() or by_tile_aop() function.") + raise ValueError( + f"{dpid} is a remote sensing data product. Use the by_file_aop() or by_tile_aop() function." + ) # Phenocam products if dpid == "DP1.00033.001" or dpid == "DP1.00042.001": - raise ValueError(f"{dpid} is a phenological image product, data are hosted by Phenocam.") + raise ValueError( + f"{dpid} is a phenological image product, data are hosted by Phenocam." + ) # Aeronet product if dpid == "DP1.00043.001": - raise ValueError(f"Spectral sun photometer ({dpid}) data are hosted by Aeronet.") + raise ValueError( + f"Spectral sun photometer ({dpid}) data are hosted by Aeronet." + ) # DHP expanded package if dpid == "DP1.10017.001" and package == "expanded": - raise ValueError("Digital hemispherical images expanded file packages exceed programmatic download limits. Either download from the data portal, or download the basic package and use the URLs in the data to download the images themselves. Follow instructions in the Data Product User Guide for image file naming.") + raise ValueError( + "Digital hemispherical images expanded file packages exceed programmatic download limits. Either download from the data portal, or download the basic package and use the URLs in the data to download the images themselves. Follow instructions in the Data Product User Guide for image file naming." + ) # individual SAE products - if dpid in ['DP1.00007.001', 'DP1.00010.001', 'DP1.00034.001', 'DP1.00035.001', - 'DP1.00036.001', 'DP1.00037.001', 'DP1.00099.001', 'DP1.00100.001', - 'DP2.00008.001', 'DP2.00009.001', 'DP2.00024.001', 'DP3.00008.001', - 'DP3.00009.001', 'DP3.00010.001', 'DP4.00002.001', 'DP4.00007.001', - 'DP4.00067.001', 'DP4.00137.001', 'DP4.00201.001', 'DP1.00030.001']: - raise ValueError(f"{dpid} is only available in the bundled eddy covariance data product. Download DP4.00200.001 to access these data.") + if dpid in [ + "DP1.00007.001", + "DP1.00010.001", + "DP1.00034.001", + "DP1.00035.001", + "DP1.00036.001", + "DP1.00037.001", + "DP1.00099.001", + "DP1.00100.001", + "DP2.00008.001", + "DP2.00009.001", + "DP2.00024.001", + "DP3.00008.001", + "DP3.00009.001", + "DP3.00010.001", + "DP4.00002.001", + "DP4.00007.001", + "DP4.00067.001", + "DP4.00137.001", + "DP4.00201.001", + "DP1.00030.001", + ]: + raise ValueError( + f"{dpid} is only available in the bundled eddy covariance data product. Download DP4.00200.001 to access these data." + ) # check for incompatible values of release= and include_provisional= if release == "PROVISIONAL" and not include_provisional: - raise ValueError("Download request is for release=PROVISIONAL. To download PROVISIONAL data, enter input parameter include_provisional=True.") + raise ValueError( + "Download request is for release=PROVISIONAL. To download PROVISIONAL data, enter input parameter include_provisional=True." + ) if re.search(pattern="RELEASE", string=release) is not None and include_provisional: - logging.info(f"Download request is for release={release} but include_provisional=True. Only data in {release} will be downloaded.") + logging.info( + f"Download request is for release={release} but include_provisional=True. Only data in {release} will be downloaded." 
+ ) # error message if dates aren't formatted correctly # separate logic for each, to easily allow only one to be NA if startdate is not None: if re.search(pattern="[0-9]{4}-[0-9]{2}", string=startdate) is None: - raise ValueError("startdate and enddate must be either None or valid dates in the form YYYY-MM") + raise ValueError( + "startdate and enddate must be either None or valid dates in the form YYYY-MM" + ) if enddate is not None: if re.search(pattern="[0-9]{4}-[0-9]{2}", string=enddate) is None: - raise ValueError("startdate and enddate must be either None or valid dates in the form YYYY-MM") + raise ValueError( + "startdate and enddate must be either None or valid dates in the form YYYY-MM" + ) # can only specify timeindex xor tabl if timeindex != "all" and tabl != "all": @@ -274,7 +362,9 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, site = [site] # redirect for aqu met products and bundles - shared_aquatic_file = (importlib_resources.files(__resources__)/"shared_aquatic.csv") + shared_aquatic_file = ( + importlib_resources.files(__resources__) / "shared_aquatic.csv" + ) shared_aquatic_df = pd.read_csv(shared_aquatic_file, index_col="site") if site != ["all"]: @@ -285,10 +375,12 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, ss = shared_aquatic_df.loc[s] if dpid in list(ss["product"]): indx = indx + 1 - sx = list(ss["towerSite"][ss["product"]==dpid]) + sx = list(ss["towerSite"][ss["product"] == dpid]) siter.append(sx) if indx == 1: - logging.info(f"Some sites in your download request are aquatic sites where {dpid} is collected at a nearby terrestrial site. The sites you requested, and the sites that will be accessed instead, are listed below.") + logging.info( + f"Some sites in your download request are aquatic sites where {dpid} is collected at a nearby terrestrial site. The sites you requested, and the sites that will be accessed instead, are listed below." + ) logging.info(f"{s} -> {''.join(sx)}") else: siter.append([s]) @@ -299,43 +391,66 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, siter = site # redirect for chemistry bundles - chem_bundles_file = (importlib_resources.files(__resources__)/"chem_bundles.csv") + chem_bundles_file = importlib_resources.files(__resources__) / "chem_bundles.csv" chem_bundles_df = pd.read_csv(chem_bundles_file) if dpid in list(chem_bundles_df["product"]): - newDPID = list(chem_bundles_df["homeProduct"][chem_bundles_df["product"]==dpid]) + newDPID = list( + chem_bundles_df["homeProduct"][chem_bundles_df["product"] == dpid] + ) if newDPID == ["depends"]: - raise ValueError("Root chemistry and isotopes have been bundled with the root biomass data. For root chemistry from Megapits, download DP1.10066.001. For root chemistry from periodic sampling, download DP1.10067.001.") + raise ValueError( + "Root chemistry and isotopes have been bundled with the root biomass data. For root chemistry from Megapits, download DP1.10066.001. For root chemistry from periodic sampling, download DP1.10067.001." + ) else: - raise ValueError(f"{''.join(dpid)} has been bundled with {''.join(newDPID)} and is not available independently. Please download {''.join(newDPID)}.") + raise ValueError( + f"{''.join(dpid)} has been bundled with {''.join(newDPID)} and is not available independently. Please download {''.join(newDPID)}." 
+ ) # redirect for veg structure and sediment data product bundles - other_bundles_file = (importlib_resources.files(__resources__)/"other_bundles.csv") + other_bundles_file = importlib_resources.files(__resources__) / "other_bundles.csv" other_bundles_df = pd.read_csv(other_bundles_file) if dpid in list(other_bundles_df["product"]): - bundle_release = other_bundles_df["lastRelease"][other_bundles_df["product"]==dpid].values[0] - if release>bundle_release: - newDPID = list(other_bundles_df["homeProduct"][other_bundles_df["product"]==dpid]) - raise ValueError(f"In all releases after {bundle_release}, {''.join(dpid)} has been bundled with {''.join(newDPID)} and is not available independently. Please download {''.join(newDPID)}.") + bundle_release = other_bundles_df["lastRelease"][ + other_bundles_df["product"] == dpid + ].values[0] + if release > bundle_release: + newDPID = list( + other_bundles_df["homeProduct"][other_bundles_df["product"] == dpid] + ) + raise ValueError( + f"In all releases after {bundle_release}, {''.join(dpid)} has been bundled with {''.join(newDPID)} and is not available independently. Please download {''.join(newDPID)}." + ) # end of error and exception handling, start the work # query the /products endpoint for the product requested if release == "current" or release == "PROVISIONAL": - prodreq = get_api(api_url="https://data.neonscience.org/api/v0/products/" - + dpid, token=token) + prodreq = get_api( + api_url="https://data.neonscience.org/api/v0/products/" + dpid, token=token + ) else: - prodreq = get_api(api_url="https://data.neonscience.org/api/v0/products/" - + dpid + "?release=" + release, token=token) + prodreq = get_api( + api_url="https://data.neonscience.org/api/v0/products/" + + dpid + + "?release=" + + release, + token=token, + ) if prodreq is None: if release == "LATEST": - logging.info(f"No data found for product {dpid}. LATEST data requested; check that token is valid for LATEST access.") + logging.info( + f"No data found for product {dpid}. LATEST data requested; check that token is valid for LATEST access." + ) return else: if release != "current" and release != "PROVISIONAL": - rels = get_api(api_url="https://data.neonscience.org/api/v0/releases/", - token=token) + rels = get_api( + api_url="https://data.neonscience.org/api/v0/releases/", token=token + ) if rels is None: - raise ConnectionError("Data product was not found or API was unreachable.") + raise ConnectionError( + "Data product was not found or API was unreachable." + ) relj = rels.json() reld = relj["data"] rellist = [] @@ -344,10 +459,14 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, if release not in rellist: raise ValueError(f"Release not found. Valid releases are {rellist}") else: - raise ConnectionError("Data product was not found or API was unreachable.") + raise ConnectionError( + "Data product was not found or API was unreachable." + ) else: - raise ConnectionError("Data product was not found or API was unreachable.") - + raise ConnectionError( + "Data product was not found or API was unreachable." 
+ ) + avail = prodreq.json() # error message if product or data not found @@ -360,18 +479,25 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, pass # check that token was used - if 'x-ratelimit-limit' in prodreq.headers and token is not None: - if prodreq.headers.get('x-ratelimit-limit') == 200: + if "x-ratelimit-limit" in prodreq.headers and token is not None: + if prodreq.headers.get("x-ratelimit-limit") == 200: logging.info("API token was not recognized. Public rate limit applied.") # use query endpoint if cloud mode selected if cloud_mode: - fls = query_files(lst=avail, dpid=dpid, site=site, - startdate=startdate, enddate=enddate, - package=package, release=release, - timeindex=timeindex, tabl=tabl, - include_provisional=include_provisional, - token=token) + fls = query_files( + lst=avail, + dpid=dpid, + site=site, + startdate=startdate, + enddate=enddate, + package=package, + release=release, + timeindex=timeindex, + tabl=tabl, + include_provisional=include_provisional, + token=token, + ) return fls else: @@ -407,10 +533,12 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, # subset by start date if startdate is not None: ste = re.compile("20[0-9]{2}-[0-9]{2}") - start_urls = [st for st in site_urls if ste.search(st).group(0)>=startdate] + start_urls = [ + st for st in site_urls if ste.search(st).group(0) >= startdate + ] else: start_urls = site_urls - + # check for no results if len(start_urls) == 0: logging.info("There are no data at the selected date(s).") @@ -419,7 +547,7 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, # subset by end date if enddate is not None: ete = re.compile("20[0-9]{2}-[0-9]{2}") - end_urls = [et for et in start_urls if ete.search(et).group(0)<=enddate] + end_urls = [et for et in start_urls if ete.search(et).group(0) <= enddate] else: end_urls = start_urls @@ -430,42 +558,61 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None, # if downloading entire site-months, pass to get_zip_urls to query each month for url if timeindex == "all" and tabl == "all": - durls = get_zip_urls(url_set=end_urls, package=package, release=release, - include_provisional=include_provisional, - token=token, progress=progress) + durls = get_zip_urls( + url_set=end_urls, + package=package, + release=release, + include_provisional=include_provisional, + token=token, + progress=progress, + ) else: # if downloading by table or averaging interval, pass to get_tab_urls - durls = get_tab_urls(url_set=end_urls, package=package, release=release, - include_provisional=include_provisional, - timeindex=timeindex, tabl=tabl, - token=token, progress=progress) + durls = get_tab_urls( + url_set=end_urls, + package=package, + release=release, + include_provisional=include_provisional, + timeindex=timeindex, + tabl=tabl, + token=token, + progress=progress, + ) # check download size download_size = convert_byte_size(sum(durls["sz"])) if check_size: - if input(f"Continuing will download {len(durls['z'])} files totaling approximately {download_size}. Do you want to proceed? (y/n) ") != "y": + if ( + input( + f"Continuing will download {len(durls['z'])} files totaling approximately {download_size}. Do you want to proceed? (y/n) " + ) + != "y" + ): logging.info("Download halted.") return None else: - logging.info(f"Downloading {len(durls['z'])} files totaling approximately {download_size}.") + logging.info( + f"Downloading {len(durls['z'])} files totaling approximately {download_size}." 
+ ) # set up folder to save to if savepath is None: savepath = os.getcwd() - outpath = savepath+"/filesToStack"+dpid[4:9]+"/" + outpath = savepath + "/filesToStack" + dpid[4:9] + "/" if not os.path.exists(outpath): os.makedirs(outpath) else: - logging.info("Warning: Download folder already exists. Check carefully for duplicate files.") + logging.info( + "Warning: Download folder already exists. Check carefully for duplicate files." + ) if timeindex != "all" or tabl != "all": for f in durls["flpth"]: - if not os.path.exists(outpath+f): - os.makedirs(outpath+f) + if not os.path.exists(outpath + f): + os.makedirs(outpath + f) # download data from each url - download_urls(url_set=durls, outpath=outpath, - token=token, progress=progress) + download_urls(url_set=durls, outpath=outpath, token=token, progress=progress) return None diff --git a/src/neonutilities/unzip_and_stack.py b/src/neonutilities/unzip_and_stack.py index 9838915..bc92220 100644 --- a/src/neonutilities/unzip_and_stack.py +++ b/src/neonutilities/unzip_and_stack.py @@ -20,20 +20,23 @@ from .read_table_neon import get_variables, cast_table_neon from . import __resources__ import logging -logging.basicConfig(level=logging.INFO, format='%(message)s') - -varschema = pa.schema([ - ('table', pa.string()), - ('fieldName', pa.string()), - ('description', pa.string()), - ('dataType', pa.string()), - ('units', pa.string()), - ('sampleCode', pa.string()), - ('downloadPkg', pa.string()), - ('pubFormat', pa.string()), - ('primaryKey', pa.string()), - ('categoricalCodeName', pa.string()) -]) + +logging.basicConfig(level=logging.INFO, format="%(message)s") + +varschema = pa.schema( + [ + ("table", pa.string()), + ("fieldName", pa.string()), + ("description", pa.string()), + ("dataType", pa.string()), + ("units", pa.string()), + ("sampleCode", pa.string()), + ("downloadPkg", pa.string()), + ("pubFormat", pa.string()), + ("primaryKey", pa.string()), + ("categoricalCodeName", pa.string()), + ] +) def unzip_zipfile(zippath): @@ -56,13 +59,13 @@ def unzip_zipfile(zippath): @author: Zachary Nickerson - Updated Feb 2025 to use "with" so that zip files properly close out + Updated Feb 2025 to use "with" so that zip files properly close out and fix error handling for Windows filepath character length limits """ # Error handling on inputs - if zippath[-4:] in ['.zip', '.ZIP']: + if zippath[-4:] in [".zip", ".ZIP"]: outpath = os.path.dirname(zippath) level = "all" else: @@ -70,22 +73,30 @@ def unzip_zipfile(zippath): level = "in" if level == "all": - with zipfile.ZipFile(zippath, 'r') as zip_ref: - tl = zip_ref.namelist(); #print('tl:',tl) + with zipfile.ZipFile(zippath, "r") as zip_ref: + tl = zip_ref.namelist() + # print('tl:',tl) # Construct full paths as they will be after extraction # this is used in stack_by_table, eg. 
# litterlst = nu.stack_by_table("./testdata/NEON_litterfall.zip", savepath="envt") - full_extracted_paths = [os.path.join(os.path.abspath(zippath).replace('.zip',''), zipname) for zipname in tl] + full_extracted_paths = [ + os.path.join(os.path.abspath(zippath).replace(".zip", ""), zipname) + for zipname in tl + ] # print('full_extracted_paths:',full_extracted_paths) # print('len(full_extracted_paths):',[len(x) for x in full_extracted_paths]) # Error handling for filepath character length limitations on Windows # This is a warning - if platform.system() == "Windows" and any(len(x) > 260 for x in full_extracted_paths): + if platform.system() == "Windows" and any( + len(x) > 260 for x in full_extracted_paths + ): longest_path = max(full_extracted_paths, key=len) - warnings.warn("Filepaths on Windows are limited to 260 characters. " - f"Attempting to extract a filepath that is > 260 characters long. " - "Move your working or savepath directory closer to the root directory or enable " - "long path support in Windows.") + warnings.warn( + "Filepaths on Windows are limited to 260 characters. " + "Attempting to extract a filepath that is > 260 characters long. " + "Move your working or savepath directory closer to the root directory or enable " + "long path support in Windows." + ) try: # Unzip file and get list of zips within (this will still fail if paths are too long on Windows) @@ -95,52 +106,62 @@ def unzip_zipfile(zippath): # this might be deprecated (shouldn't happen any more). level as an input might also be deprecated # leaving in in case anything changes in tabular data if len(zps) > 0: - print('Multiple zip files are contained within the parent zip file. Need an example to properly code this up.\n') + print( + "Multiple zip files are contained within the parent zip file. Need an example to properly code this up.\n" + ) except FileNotFoundError as e: print(e) - raise OSError("Filepaths on Windows are limited to 260 characters. " - f"Attempting to extract a filepath that is {len(longest_path)} characters long. " - "Move your working or savepath directory closer to the root directory or enable " - "long path support in Windows.") - + raise OSError( + "Filepaths on Windows are limited to 260 characters. " + f"Attempting to extract a filepath that is {len(longest_path)} characters long. " + "Move your working or savepath directory closer to the root directory or enable " + "long path support in Windows." + ) if level == "in": - zps = glob.glob(outpath+"/*.zip") + zps = glob.glob(outpath + "/*.zip") for i in range(0, len(zps)): - with zipfile.ZipFile(zps[i], 'r') as zip_refi: + with zipfile.ZipFile(zps[i], "r") as zip_refi: tl = zip_refi.namelist() # Construct full paths as they will be after extraction - full_extracted_paths = [os.path.join( - zps[i].replace('.zip',''), zipname) for zipname in tl] + full_extracted_paths = [ + os.path.join(zps[i].replace(".zip", ""), zipname) for zipname in tl + ] longest_path = max(full_extracted_paths, key=len) # print('full extracted paths:',full_extracted_paths) # print('len(full_extracted_paths):',[len(x) for x in full_extracted_paths]) # Warning for filepath character length limitations on Windows - if platform.system() == "Windows" and any(len(x) > 260 for x in full_extracted_paths): - warnings.warn("Filepaths on Windows are limited to 260 characters. " - f"Attempting to extract a filepath that is > 260 characters long. 
" - "Move your working or savepath directory closer to the root directory " - "or enable long path support in Windows.", UserWarning) + if platform.system() == "Windows" and any( + len(x) > 260 for x in full_extracted_paths + ): + warnings.warn( + "Filepaths on Windows are limited to 260 characters. " + "Attempting to extract a filepath that is > 260 characters long. " + "Move your working or savepath directory closer to the root directory " + "or enable long path support in Windows.", + UserWarning, + ) try: outpathi = zps[i][:-4] zip_refi.extractall(path=outpathi) except FileNotFoundError as e: - raise OSError("ERROR: Filepaths on Windows are limited to 260 characters. " - f"Attempting to extract a filepath that is {len(longest_path)} characters long. " - "Move your working or savepath directory closer to the root directory or enable " - "long path support in Windows.") + raise OSError( + "ERROR: Filepaths on Windows are limited to 260 characters. " + f"Attempting to extract a filepath that is {len(longest_path)} characters long. " + "Move your working or savepath directory closer to the root directory or enable " + "long path support in Windows." + ) print(e) os.remove(zps[i]) return None -def find_datatables(folder, - f_names=True): +def find_datatables(folder, f_names=True): """ Find data tables @@ -165,7 +186,7 @@ def find_datatables(folder, """ # get all .csv files in the folder and its subfolders - csv_files = glob.glob(os.path.join(folder, '**', '*.csv'), recursive=True) + csv_files = glob.glob(os.path.join(folder, "**", "*.csv"), recursive=True) # if fnames is True, return the full file paths; otherwise, return the base names if f_names: @@ -178,7 +199,7 @@ def get_recent_publication(filepaths): """ Returns the most recent files for those that do not need stacking - + Parameters -------- filepaths: List of file paths @@ -193,7 +214,9 @@ def get_recent_publication(filepaths): """ # extract the publication dates from the file paths - pub_dates = [re.search(r'20\d{2}\d{2}\d{2}', os.path.basename(f)) for f in filepaths] + pub_dates = [ + re.search(r"20\d{2}\d{2}\d{2}", os.path.basename(f)) for f in filepaths + ] pub_dates = [m.group(0) for m in pub_dates if m is not None] # get the most recent publication date @@ -205,7 +228,6 @@ def get_recent_publication(filepaths): return recent_files - def string_schema(v): """ @@ -325,7 +347,9 @@ def find_table_types(datatables): td.append(ss) if len(td) == 0: - logging.info("No data tables found, only metadata. Try downloading expanded package, and check availability on the NEON data portal.") + logging.info( + "No data tables found, only metadata. Try downloading expanded package, and check availability on the NEON data portal." + ) return else: tn = list(set(td)) @@ -338,7 +362,9 @@ def find_table_types(datatables): ttklist = list(map(table_type_formats, tnknames)) ttk = list(set(ttklist)) if len(ttk) > 1: - raise ValueError(f"In files to be stacked, table {tnk} has been published under conflicting schedules. To avoid this problem, either work only with released data, or stack released and provisional data separately.") + raise ValueError( + f"In files to be stacked, table {tnk} has been published under conflicting schedules. To avoid this problem, either work only with released data, or stack released and provisional data separately." 
+ ) return else: tt[tnk] = ttk[0] @@ -417,7 +443,7 @@ def remove_srf_dups(srftab): idset = list(set(srftab["srfID"])) srfsublist = [] for i in idset: - srfsubi = srftab[srftab.srfID==i] + srfsubi = srftab[srftab.srfID == i] indi = srfsubi.idxmax()["lastUpdateDateTime"] srfsublist.append(srfsubi.loc[indi]) srfsub = pd.DataFrame(srfsublist) @@ -442,16 +468,19 @@ def align_sp_cols(sptab): @author: Claire Lunch """ - oldcols = {"name": "sensorLocationID", - "description": "sensorLocationDescription", - "start": "positionStartDateTime", "end": "positionEndDateTime", - "referenceName": "referenceLocationID", - "referenceDescription": "referenceLocationIDDescription", - "referenceStart": "referenceLocationIDStartDateTime", - "referenceEnd": "referenceLocationIDEndDateTime", - "referenceLatitude": "locationReferenceLatitude", - "referenceLongitude": "locationReferenceLongitude", - "referenceElevation": "locationReferenceElevation"} + oldcols = { + "name": "sensorLocationID", + "description": "sensorLocationDescription", + "start": "positionStartDateTime", + "end": "positionEndDateTime", + "referenceName": "referenceLocationID", + "referenceDescription": "referenceLocationIDDescription", + "referenceStart": "referenceLocationIDStartDateTime", + "referenceEnd": "referenceLocationIDEndDateTime", + "referenceLatitude": "locationReferenceLatitude", + "referenceLongitude": "locationReferenceLongitude", + "referenceElevation": "locationReferenceElevation", + } for k in list(oldcols.keys()): if all(sptab[k].isna()): sptab.drop(columns=k, inplace=True) @@ -483,65 +512,72 @@ def sort_dat(pdata): # sort the table by site, then HOR/VER, then date, all ascending pcols = pdata.columns.to_list() datevar = None - if 'collectDate' in pcols: - datevar = 'collectDate' + if "collectDate" in pcols: + datevar = "collectDate" else: - if 'endDate' in pcols: - datevar = 'endDate' + if "endDate" in pcols: + datevar = "endDate" else: - if 'startDate' in pcols: - datevar = 'startDate' + if "startDate" in pcols: + datevar = "startDate" else: - if 'date' in pcols: - datevar = 'date' + if "date" in pcols: + datevar = "date" else: - if 'endDateTime' in pcols: - datevar = 'endDateTime' + if "endDateTime" in pcols: + datevar = "endDateTime" else: - if 'startDateTime' in pcols: - datevar = 'startDateTime' - if 'horizontalPosition' not in pcols: + if "startDateTime" in pcols: + datevar = "startDateTime" + if "horizontalPosition" not in pcols: try: if datevar is None: - pdata.sort_values(by=['siteID'], - ascending=[True], - inplace=True, ignore_index=True) + pdata.sort_values( + by=["siteID"], ascending=[True], inplace=True, ignore_index=True + ) else: - pdata.sort_values(by=['siteID', datevar], - ascending=[True,True], - inplace=True, ignore_index=True) + pdata.sort_values( + by=["siteID", datevar], + ascending=[True, True], + inplace=True, + ignore_index=True, + ) except Exception: try: - pdata.sort_values(by=[datevar], - ascending=[True], - inplace=True, ignore_index=True) + pdata.sort_values( + by=[datevar], ascending=[True], inplace=True, ignore_index=True + ) except Exception: pass else: try: if datevar is None: - pdata.sort_values(by=['siteID', 'horizontalPosition', 'verticalPosition'], - ascending=[True, True, True], - inplace=True, ignore_index=True) + pdata.sort_values( + by=["siteID", "horizontalPosition", "verticalPosition"], + ascending=[True, True, True], + inplace=True, + ignore_index=True, + ) else: - pdata.sort_values(by=['siteID', 'horizontalPosition', 'verticalPosition', datevar], - ascending=[True, True, True, 
True], - inplace=True, ignore_index=True) + pdata.sort_values( + by=["siteID", "horizontalPosition", "verticalPosition", datevar], + ascending=[True, True, True, True], + inplace=True, + ignore_index=True, + ) except Exception: pass - - return(pdata) + return pdata -def stack_frame_files(framefiles, dpid, - seqtyp=None, - cloud_mode=False): + +def stack_frame_files(framefiles, dpid, seqtyp=None, cloud_mode=False): """ - Helper function to stack "data frame" files. These files do not go through the normal - publication process, they are stored and published as a fixed unit. NEON uses this - workflow for very large tabular files that can't be handled by the standard OS + Helper function to stack "data frame" files. These files do not go through the normal + publication process, they are stored and published as a fixed unit. NEON uses this + workflow for very large tabular files that can't be handled by the standard OS data pipeline. Parameters @@ -561,48 +597,62 @@ def stack_frame_files(framefiles, dpid, """ # no variables files for these, use custom files in package resources - frame_file_file = (importlib_resources.files(__resources__)/"frame_file_variables.csv") + frame_file_file = ( + importlib_resources.files(__resources__) / "frame_file_variables.csv" + ) frame_file_variables = pd.read_csv(frame_file_file, index_col=None) - #v = pd.concat([v, frame_file_variables], ignore_index=True) - - fdict = {"DP1.30012.001":"FSP", "DP1.10081.001":"MCC", "DP1.20086.001":"MCC", - "DP1.20141.001":"MCC", "DP1.20190.001":"REA", "DP1.20193.001":"REA", - "DP1.10081.002":"MCT", "DP1.20086.002":"MCT", "DP1.20141.002":"MCT", - "DP4.00132.001":"BAT"} - + # v = pd.concat([v, frame_file_variables], ignore_index=True) + + fdict = { + "DP1.30012.001": "FSP", + "DP1.10081.001": "MCC", + "DP1.20086.001": "MCC", + "DP1.20141.001": "MCC", + "DP1.20190.001": "REA", + "DP1.20193.001": "REA", + "DP1.10081.002": "MCT", + "DP1.20086.002": "MCT", + "DP1.20141.002": "MCT", + "DP4.00132.001": "BAT", + } + fvars = pa.Table.from_pandas(frame_file_variables) ftab = fvars.filter(pa.compute.field("table") == fdict[dpid]) fpkgvar = ftab.to_pandas() fschema = get_variables(fpkgvar) - + if cloud_mode: gcs = fs.GcsFileSystem(anonymous=True) - framebuckets = [re.sub(pattern="https://storage.neonscience.org/", - repl="", string=b) for b in framefiles] - fdat = dataset.dataset(source=framebuckets, filesystem=gcs, - format="csv", schema=fschema) + framebuckets = [ + re.sub(pattern="https://storage.neonscience.org/", repl="", string=b) + for b in framefiles + ] + fdat = dataset.dataset( + source=framebuckets, filesystem=gcs, format="csv", schema=fschema + ) else: - fdat = dataset.dataset(source=framefiles, format="csv", - schema=fschema) + fdat = dataset.dataset(source=framefiles, format="csv", schema=fschema) try: fdattab = fdat.to_table() except Exception: stringschema = unknown_string_schema(fdat.head(num_rows=0).column_names) if cloud_mode: - fdat = dataset.dataset(source=framebuckets, filesystem=gcs, - format="csv", schema=stringschema) + fdat = dataset.dataset( + source=framebuckets, filesystem=gcs, format="csv", schema=stringschema + ) else: - fdat = dataset.dataset(source=framefiles, - format="csv", schema=stringschema) + fdat = dataset.dataset(source=framefiles, format="csv", schema=stringschema) fdattab = fdat.to_table() - logging.info("Large file schema did not match expectations; all variable types set to string.") - + logging.info( + "Large file schema did not match expectations; all variable types set to string." 
+ ) + fpdat = fdattab.to_pandas() - + nm = "per_sample" - + if dpid == "DP1.20190.001": nm = "rea_conductivityRawData" elif dpid == "DP1.20193.001": @@ -611,24 +661,23 @@ def stack_frame_files(framefiles, dpid, nm = "fsp_rawSpectra" elif dpid == "DP4.00132.001": nm = "bat_processedSonarFile" - elif dpid=="DP1.10081.001": + elif dpid == "DP1.10081.001": nm = f"mcc_soilPerSampleTaxonomy_{seqtyp}" - elif dpid=="DP1.20086.001": + elif dpid == "DP1.20086.001": nm = f"mcc_benthicPerSampleTaxonomy_{seqtyp}" - elif dpid=="DP1.20141.001": + elif dpid == "DP1.20141.001": nm = f"mcc_surfaceWaterPerSampleTaxonomy_{seqtyp}" - elif dpid=="DP1.10081.002": + elif dpid == "DP1.10081.002": nm = f"mct_soilPerSampleTaxonomy_{seqtyp}" - elif dpid=="DP1.20086.002": + elif dpid == "DP1.20086.002": nm = f"mct_benthicPerSampleTaxonomy_{seqtyp}" - elif dpid=="DP1.20141.002": + elif dpid == "DP1.20141.002": nm = f"mct_surfaceWaterPerSampleTaxonomy_{seqtyp}" - - return {"frmdat":fpdat, "frmnm":nm} + + return {"frmdat": fpdat, "frmnm": nm} -def format_readme(readmetab, - tables): +def format_readme(readmetab, tables): """ Remove site-specific information from the most recently published readme file in the download, and add generic information about the neonutilities download. @@ -650,34 +699,43 @@ def format_readme(readmetab, rd = readmetab if len(tables) > 0: # replace query specific text - dpackind = rd[0].str.contains('CONTENTS').idxmax() - rd.loc[dpackind + 2, 0] = f"This data product contains up to {len(tables)} data tables:" - rd.loc[dpackind + 3: dpackind + 3 + len(tables)-1, 0] = tables - rd.loc[dpackind + 3 + len(tables), 0] = "If data are unavailable for the particular sites and dates queried, some tables may be absent." - qind = rd[0].str.contains('QUERY').idxmax() - downpackind = rd[0].str.contains('Basic download package').idxmax() + dpackind = rd[0].str.contains("CONTENTS").idxmax() + rd.loc[ + dpackind + 2, 0 + ] = f"This data product contains up to {len(tables)} data tables:" + rd.loc[dpackind + 3 : dpackind + 3 + len(tables) - 1, 0] = tables + rd.loc[ + dpackind + 3 + len(tables), 0 + ] = "If data are unavailable for the particular sites and dates queried, some tables may be absent." + qind = rd[0].str.contains("QUERY").idxmax() + downpackind = rd[0].str.contains("Basic download package").idxmax() # Remove specific rows - remove_indices = list(range(qind, dpackind)) + list(range(dpackind + 4 + len(tables), downpackind)) + rd.index[rd[0].str.contains("Date-Time")].tolist() + remove_indices = ( + list(range(qind, dpackind)) + + list(range(dpackind + 4 + len(tables), downpackind)) + + rd.index[rd[0].str.contains("Date-Time")].tolist() + ) remove_indices = [index for index in remove_indices if index in list(rd.index)] rd = rd.drop(remove_indices) # add disclaimer - disclaimer = pd.DataFrame({0: ["###################################", - "########### Disclaimer ############", - "This is the most recent readme publication based on all site-date combinations used during stackByTable.", - "Information specific to the query, including sites and dates, has been removed. The remaining content reflects general metadata for the data product.", - "##################################"]}) + disclaimer = pd.DataFrame( + { + 0: [ + "###################################", + "########### Disclaimer ############", + "This is the most recent readme publication based on all site-date combinations used during stackByTable.", + "Information specific to the query, including sites and dates, has been removed. 
The remaining content reflects general metadata for the data product.", + "##################################", + ] + } + ) rd = pd.concat([disclaimer, rd], ignore_index=True) - return(rd) + return rd -def stack_data_files_parallel(folder, - package, - dpid, - progress=True, - cloud_mode=False - ): +def stack_data_files_parallel(folder, package, dpid, progress=True, cloud_mode=False): """ Join data files in a unzipped NEON data package by table type @@ -709,46 +767,72 @@ def stack_data_files_parallel(folder, gcs = fs.GcsFileSystem(anonymous=True) else: # Get filenames without full path - filenames = find_datatables(folder = folder, f_names=False) - + filenames = find_datatables(folder=folder, f_names=False) + # Get filenames with full path - filepaths = find_datatables(folder = folder, f_names=True) + filepaths = find_datatables(folder=folder, f_names=True) # dictionary for outputs stacklist = {} # handle per-sample (data frame) tables separately - if dpid in ["DP1.30012.001", "DP1.10081.001", "DP1.20086.001","DP1.20141.001", "DP1.20190.001", - "DP1.20193.001", "DP1.10081.002", "DP1.20086.002","DP1.20141.002", - "DP4.00132.001"] and len([f for f in filenames if not f.startswith("NEON.")]) > 0: - framefiles = [f for f in filepaths if not os.path.basename(f).startswith("NEON.")] + if ( + dpid + in [ + "DP1.30012.001", + "DP1.10081.001", + "DP1.20086.001", + "DP1.20141.001", + "DP1.20190.001", + "DP1.20193.001", + "DP1.10081.002", + "DP1.20086.002", + "DP1.20141.002", + "DP4.00132.001", + ] + and len([f for f in filenames if not f.startswith("NEON.")]) > 0 + ): + framefiles = [ + f for f in filepaths if not os.path.basename(f).startswith("NEON.") + ] filepaths = [f for f in filepaths if os.path.basename(f).startswith("NEON.")] filenames = [f for f in filenames if os.path.basename(f).startswith("NEON.")] - + # stack frame files if progress: - logging.info("Stacking per-sample files. These files may be very large; download data in smaller subsets if performance problems are encountered.\n") + logging.info( + "Stacking per-sample files. 
These files may be very large; download data in smaller subsets if performance problems are encountered.\n" + ) # subset microbe community data by taxonomic group # and stack both sets - if dpid in ["DP1.10081.001", "DP1.20086.001","DP1.20141.001", - "DP1.10081.002", "DP1.20086.002","DP1.20141.002"]: + if dpid in [ + "DP1.10081.001", + "DP1.20086.001", + "DP1.20141.001", + "DP1.10081.002", + "DP1.20086.002", + "DP1.20141.002", + ]: bacteriafiles = [b for b in framefiles if re.search("[_]16S[_]", b)] fungifiles = [b for b in framefiles if re.search("[_]ITS[_]", b)] - - if len(bacteriafiles)>0: - fpdat16 = stack_frame_files(bacteriafiles, dpid=dpid, - seqtyp="16S", cloud_mode=cloud_mode) + + if len(bacteriafiles) > 0: + fpdat16 = stack_frame_files( + bacteriafiles, dpid=dpid, seqtyp="16S", cloud_mode=cloud_mode + ) stacklist[fpdat16["frmnm"]] = fpdat16["frmdat"] - - if len(fungifiles)>0: - fpdatIT = stack_frame_files(fungifiles, dpid=dpid, - seqtyp="ITS", cloud_mode=cloud_mode) + + if len(fungifiles) > 0: + fpdatIT = stack_frame_files( + fungifiles, dpid=dpid, seqtyp="ITS", cloud_mode=cloud_mode + ) stacklist[fpdatIT["frmnm"]] = fpdatIT["frmdat"] - + else: - fpdat = stack_frame_files(framefiles, dpid=dpid, seqtyp=None, - cloud_mode=cloud_mode) + fpdat = stack_frame_files( + framefiles, dpid=dpid, seqtyp=None, cloud_mode=cloud_mode + ) stacklist[fpdat["frmnm"]] = fpdat["frmdat"] # make a dictionary, where filenames are the keys to the filepath values @@ -773,30 +857,51 @@ def stack_data_files_parallel(folder, # metadata files # get variables and validation files using the most recent publication date - if any(re.search('variables.20', path) for path in filepaths): - varpath = get_recent_publication([path for path in filepaths if "variables.20" in path])[0] + if any(re.search("variables.20", path) for path in filepaths): + varpath = get_recent_publication( + [path for path in filepaths if "variables.20" in path] + )[0] if cloud_mode: - vp = dataset.dataset(source=re.sub("https://storage.googleapis.com/", "", varpath), - filesystem=gcs, format="csv", schema=varschema) + vp = dataset.dataset( + source=re.sub("https://storage.googleapis.com/", "", varpath), + filesystem=gcs, + format="csv", + schema=varschema, + ) va = vp.to_table() v = va.to_pandas() else: - v = pd.read_csv(varpath, sep=',') + v = pd.read_csv(varpath, sep=",") # if science review flags are present but missing from variables file, add variables if "science_review_flags" not in list(v["table"]): if any("science_review_flags" in path for path in filepaths): - science_review_file = (importlib_resources.files(__resources__)/"science_review_variables.csv") - science_review_variables = pd.read_csv(science_review_file, index_col=None) + science_review_file = ( + importlib_resources.files(__resources__) + / "science_review_variables.csv" + ) + science_review_variables = pd.read_csv( + science_review_file, index_col=None + ) v = pd.concat([v, science_review_variables], ignore_index=True) # if sensor positions are present but missing from variables file, add variables if any("sensor_positions" in path for path in filepaths): - sensor_positions_map = (importlib_resources.files(__resources__)/"sensor_positions_variables_mapping.csv") - sensor_positions_internal_variables = pd.read_csv(sensor_positions_map, index_col=None) + sensor_positions_map = ( + importlib_resources.files(__resources__) + / "sensor_positions_variables_mapping.csv" + ) + sensor_positions_internal_variables = pd.read_csv( + sensor_positions_map, index_col=None + ) if 
"sensor_positions" not in list(v["table"]): - sensor_positions_file = (importlib_resources.files(__resources__)/"sensor_positions_variables.csv") - sensor_positions_variables = pd.read_csv(sensor_positions_file, index_col=None) + sensor_positions_file = ( + importlib_resources.files(__resources__) + / "sensor_positions_variables.csv" + ) + sensor_positions_variables = pd.read_csv( + sensor_positions_file, index_col=None + ) v = pd.concat([v, sensor_positions_variables], ignore_index=True) # save the variables file @@ -804,36 +909,50 @@ def stack_data_files_parallel(folder, stacklist[f"variables_{dpnum}"] = v # get validation file - if any(re.search('validation', path) for path in filepaths): - valpath = get_recent_publication([path for path in filepaths if "validation" in path])[0] + if any(re.search("validation", path) for path in filepaths): + valpath = get_recent_publication( + [path for path in filepaths if "validation" in path] + )[0] if cloud_mode: - vp = dataset.dataset(source=re.sub("https://storage.googleapis.com/", "", valpath), - filesystem=gcs, format="csv", schema=None) + vp = dataset.dataset( + source=re.sub("https://storage.googleapis.com/", "", valpath), + filesystem=gcs, + format="csv", + schema=None, + ) va = vp.to_table() val = va.to_pandas() else: - val = pd.read_csv(valpath, sep=',') + val = pd.read_csv(valpath, sep=",") stacklist[f"validation_{dpnum}"] = val # get categoricalCodes file - if any(re.search('categoricalCodes', path) for path in filepaths): - ccpath = get_recent_publication([path for path in filepaths if "categoricalCodes" in path])[0] + if any(re.search("categoricalCodes", path) for path in filepaths): + ccpath = get_recent_publication( + [path for path in filepaths if "categoricalCodes" in path] + )[0] if cloud_mode: - cp = dataset.dataset(source=re.sub("https://storage.googleapis.com/", "", ccpath), - filesystem=gcs, format="csv", schema=None) + cp = dataset.dataset( + source=re.sub("https://storage.googleapis.com/", "", ccpath), + filesystem=gcs, + format="csv", + schema=None, + ) ca = cp.to_table() cc = ca.to_pandas() else: - cc = pd.read_csv(ccpath, sep=',') + cc = pd.read_csv(ccpath, sep=",") stacklist[f"categoricalCodes_{dpnum}"] = cc # get readme file if cloud_mode: readmefiles = filepaths else: - readmefiles = glob.glob(os.path.join(folder, '**', '*.txt'), recursive=True) + readmefiles = glob.glob(os.path.join(folder, "**", "*.txt"), recursive=True) if any(re.search("readme.20", path) for path in readmefiles): - readmepath = get_recent_publication([path for path in readmefiles if "readme.20" in path])[0] + readmepath = get_recent_publication( + [path for path in readmefiles if "readme.20" in path] + )[0] rd = None if cloud_mode: try: @@ -842,13 +961,13 @@ def stack_data_files_parallel(folder, pass else: try: - rd = pd.read_table(readmepath, delimiter='\t', header=None) + rd = pd.read_table(readmepath, delimiter="\t", header=None) except Exception: pass if rd is None: pass else: - rd = format_readme(rd, tables) + rd = format_readme(rd, tables) # save the readme stacklist[f"readme_{dpnum}"] = rd @@ -856,25 +975,26 @@ def stack_data_files_parallel(folder, if progress: logging.info("Stacking data files") arrowvars = pa.Table.from_pandas(stacklist[f"variables_{dpnum}"]) - for j in tqdm(tables, disable=not progress): - + for j in tqdm(tables, disable=not progress): # create schema from variables file, for only this table and package vtab = arrowvars.filter(pa.compute.field("table") == j) - if len(vtab)==0: + if len(vtab) == 0: vtab = 
arrowvars.filter(pa.compute.field("table") == j + "_pub") if j == "sensor_positions": vtab = pa.Table.from_pandas(sensor_positions_internal_variables) - if package=="basic": + if package == "basic": vtabpkg = vtab.filter(pa.compute.field("downloadPkg") == "basic") else: vtabpkg = vtab - + tablepkgvar = vtabpkg.to_pandas() - if len(tablepkgvar)==0: + if len(tablepkgvar) == 0: # set to string if variables file can't be found tableschema = None - logging.info(f"Variables file not found for table {j}. Data types will be inferred if possible.") + logging.info( + f"Variables file not found for table {j}. Data types will be inferred if possible." + ) else: tableschema = get_variables(tablepkgvar) @@ -906,20 +1026,22 @@ def stack_data_files_parallel(folder, # read data and append file names if cloud_mode: - tablebuckets = [re.sub(pattern="https://storage.googleapis.com/", - repl="", string=b) for b in tablepaths] - dat = dataset.dataset(source=tablebuckets, filesystem=gcs, - format="csv", schema=tableschema) + tablebuckets = [ + re.sub(pattern="https://storage.googleapis.com/", repl="", string=b) + for b in tablepaths + ] + dat = dataset.dataset( + source=tablebuckets, filesystem=gcs, format="csv", schema=tableschema + ) else: - dat = dataset.dataset(source=tablepaths, - format="csv", schema=tableschema) + dat = dataset.dataset(source=tablepaths, format="csv", schema=tableschema) if tableschema is None: cols = dat.head(num_rows=0).column_names else: cols = tableschema.names cols.append("__filename") - + # attempt to stack to table. if it fails, stack as all string fields and warn stringset = False try: @@ -927,33 +1049,46 @@ def stack_data_files_parallel(folder, except Exception: try: if tableschema is None: - stringschema = unknown_string_schema(dat.head(num_rows=0).column_names) + stringschema = unknown_string_schema( + dat.head(num_rows=0).column_names + ) else: stringschema = string_schema(tablepkgvar) if cloud_mode: - dat = dataset.dataset(source=tablebuckets, filesystem=gcs, - format="csv", schema=stringschema) + dat = dataset.dataset( + source=tablebuckets, + filesystem=gcs, + format="csv", + schema=stringschema, + ) else: - dat = dataset.dataset(source=tablepaths, - format="csv", schema=stringschema) + dat = dataset.dataset( + source=tablepaths, format="csv", schema=stringschema + ) dattab = dat.to_table(columns=cols) - logging.info(f"Table {j} schema did not match data; all variable types set to string. Data type casting will be attempted after stacking step.") + logging.info( + f"Table {j} schema did not match data; all variable types set to string. Data type casting will be attempted after stacking step." + ) stringset = True except Exception: - logging.info(f"Failed to stack table {j}. Check input data and variables file.") + logging.info( + f"Failed to stack table {j}. Check input data and variables file." + ) continue - + pdat = dattab.to_pandas() if stringset: try: pdat = cast_table_neon(pdat, tablepkgvar) except Exception: - logging.info(f"Data type casting failed for table {j}. Variable types set to string.") + logging.info( + f"Data type casting failed for table {j}. Variable types set to string." 
+ ) # append publication date pubr = re.compile("20[0-9]{6}T[0-9]{6}Z") pubval = [pubr.search(os.path.basename(p)).group(0) for p in pdat["__filename"]] - pdat = pdat.assign(publicationDate = pubval) + pdat = pdat.assign(publicationDate=pubval) # append release tag if cloud_mode: @@ -962,14 +1097,16 @@ def stack_data_files_parallel(folder, else: pubrelr = re.compile("20[0-9]{6}T[0-9]{6}Z\\..*\\/") pubrelval = [pubrelr.search(p).group(0) for p in pdat["__filename"]] - relval = [re.sub(".*\\.","",s) for s in pubrelval] - relval = [re.sub("\\/","",s) for s in relval] - pdat = pdat.assign(release = relval) + relval = [re.sub(".*\\.", "", s) for s in pubrelval] + relval = [re.sub("\\/", "", s) for s in relval] + pdat = pdat.assign(release=relval) releases.append(list(set(relval))) # append fields to variables file if f"variables_{dpnum}" in stacklist.keys(): - added_fields_file = (importlib_resources.files(__resources__)/"added_fields.csv") + added_fields_file = ( + importlib_resources.files(__resources__) / "added_fields.csv" + ) added_fields = pd.read_csv(added_fields_file, index_col=None) added_fields_all = added_fields[-2:] added_fields_all.insert(0, "table", j) @@ -980,18 +1117,18 @@ def stack_data_files_parallel(folder, # for IS products, append domainID, siteID, HOR, VER if "siteID" not in pdat.columns.to_list() and not table_types[j] == "lab": - dr = re.compile("D[0-2]{1}[0-9]{1}") domval = [dr.search(d).group(0) for d in pdat["__filename"]] pdat.insert(0, "domainID", domval) sr = re.compile("D[0-9]{2}[.][A-Z]{4}[.]") sitel = [sr.search(s).group(0) for s in pdat["__filename"]] - siteval = [re.sub(pattern="D[0-9]{2}[.]|[.]", repl="", string=s) for s in sitel] + siteval = [ + re.sub(pattern="D[0-9]{2}[.]|[.]", repl="", string=s) for s in sitel + ] pdat.insert(1, "siteID", siteval) if j != "sensor_positions": - locr = re.compile("[.][0-9]{3}[.][0-9]{3}[.][0-9]{3}[.][0-9]{3}[.]") indtemp = [locr.search(l) for l in pdat["__filename"]] if None in indtemp: @@ -1002,16 +1139,18 @@ def stack_data_files_parallel(folder, ver = [indx[9:12] for indx in indxs] pdat.insert(2, "horizontalPosition", hor) pdat.insert(3, "verticalPosition", ver) - + # sort table rows pdat = sort_dat(pdat) # append fields to variables file if f"variables_{dpnum}" in stacklist.keys(): added_fields_IS = added_fields[0:4] - added_fields_IS.insert(0,"table",j) + added_fields_IS.insert(0, "table", j) try: - vlist[j] = pd.concat([added_fields_IS, vlist[j]], ignore_index=True) + vlist[j] = pd.concat( + [added_fields_IS, vlist[j]], ignore_index=True + ) except Exception: pass @@ -1024,7 +1163,7 @@ def stack_data_files_parallel(folder, pdat = remove_srf_dups(pdat) # for sensor position files, align column names - if j =="sensor_positions": + if j == "sensor_positions": pdat = align_sp_cols(pdat) # remove filename column @@ -1053,31 +1192,34 @@ def stack_data_files_parallel(folder, releases = list(set(releases)) if "PROVISIONAL" in releases: try: - stacklist[f"citation_{dpnum}_PROVISIONAL"] = get_citation(dpid=dpid, release="PROVISIONAL") + stacklist[f"citation_{dpnum}_PROVISIONAL"] = get_citation( + dpid=dpid, release="PROVISIONAL" + ) except Exception: pass relr = re.compile("RELEASE-20[0-9]{2}") rs = [relr.search(r).group(0) for r in releases if relr.search(r)] if len(rs) == 1: - stacklist[f"citation_{dpnum}_{rs[0]}"] = get_citation(dpid=dpid, release=rs[0]) + stacklist[f"citation_{dpnum}_{rs[0]}"] = get_citation( + dpid=dpid, release=rs[0] + ) if len(rs) > 1: - logging.info("Multiple data releases were stacked together. 
This is not appropriate, check your input data.") + logging.info( + "Multiple data releases were stacked together. This is not appropriate, check your input data." + ) except Exception: pass return stacklist -def stack_by_table(filepath, - savepath=None, - save_unzipped_files=False, - progress=True, - cloud_mode=False - ): +def stack_by_table( + filepath, savepath=None, save_unzipped_files=False, progress=True, cloud_mode=False +): """ - Join data files in a zipped or unzipped NEON data package by table type. - This function can be used on a zip file downloaded from the NEON data portal or + Join data files in a zipped or unzipped NEON data package by table type. + This function can be used on a zip file downloaded from the NEON data portal or on a set of files downloaded by zips_by_product(). Parameters @@ -1086,7 +1228,7 @@ def stack_by_table(filepath, The location of the zip file or downloaded files. savepath: str, optional - The location to save the output files to. If omitted, output files will be + The location to save the output files to. If omitted, output files will be saved in the same location as the input file. save_unzipped_files: bool, optional @@ -1096,15 +1238,15 @@ def stack_by_table(filepath, Should the function display progress bars as it runs? Defaults to True. cloud_mode: bool, optional - Use cloud mode to transfer files cloud-to-cloud? If used, stack_by_table() - expects a list of file urls as input. Defaults to False; in general this - option should be used via load_by_product(), in which stack_by_table() is a + Use cloud mode to transfer files cloud-to-cloud? If used, stack_by_table() + expects a list of file urls as input. Defaults to False; in general this + option should be used via load_by_product(), in which stack_by_table() is a helper function. Return ------------------- - All files are unzipped and one file for each table type is created and written. - If savepath="envt" is specified, output is a named list of tables; otherwise, + All files are unzipped and one file for each table type is created and written. + If savepath="envt" is specified, output is a named list of tables; otherwise, function output is null and files are saved to the location specified. Example @@ -1117,11 +1259,11 @@ def stack_by_table(filepath, -------- Windows Path Length Limitations: - When using this function, you may encounter path length limitations on Windows systems. + When using this function, you may encounter path length limitations on Windows systems. Windows has a default maximum path length of 260 characters, which can cause download and unzip - functions to fail if this limit is exceeded. If the file path exceeds 260 characters on a Windows system, - the package will issue a warning.You can choose to ignore or filter these warnings using Python's warnings - module if you prefer not to see them. If the function is unable to unzip a folder due to path length + functions to fail if this limit is exceeded. If the file path exceeds 260 characters on a Windows system, + the package will issue a warning.You can choose to ignore or filter these warnings using Python's warnings + module if you prefer not to see them. If the function is unable to unzip a folder due to path length limitations, an OSError will be raised. Created on Tue Mar 5 2024 @@ -1134,32 +1276,36 @@ def stack_by_table(filepath, files = filepath[0] else: # Is the filepath input a zip file or an unzipped file? 
- if filepath[-4:] in ['.zip', '.ZIP']: + if filepath[-4:] in [".zip", ".ZIP"]: folder = False else: folder = True # Get list of files nested (and/or zipped) in filepath if not folder: - zip_ref = zipfile.ZipFile(filepath, 'r') + zip_ref = zipfile.ZipFile(filepath, "r") files = zip_ref.namelist() else: - files = glob.glob(filepath + '/**', recursive=True) + files = glob.glob(filepath + "/**", recursive=True) # Error handling if there are no standardized NEON Portal data tables in the list of files - if not any(re.search(r'NEON.D[0-9]{2}.[A-Z]{4}.', x) for x in files): - logging.info('Data files are not present in the specified filepath.') + if not any(re.search(r"NEON.D[0-9]{2}.[A-Z]{4}.", x) for x in files): + logging.info("Data files are not present in the specified filepath.") return # Determine dpid # this regexpr allows for REV = .001 or .002 dpid_listlist = [] for f in range(len(files)): - dpid_listlist.append(re.findall(re.compile("DP[1-4][.][0-9]{5}[.]00[1-2]{1}"), files[f])) + dpid_listlist.append( + re.findall(re.compile("DP[1-4][.][0-9]{5}[.]00[1-2]{1}"), files[f]) + ) dpid = [x for dpid_list in dpid_listlist for x in dpid_list] dpid = list(set(dpid)) if not len(dpid) == 1: - logging.info("Data product ID could not be determined. Check that filepath contains data files, from a single NEON data product.") + logging.info( + "Data product ID could not be determined. Check that filepath contains data files, from a single NEON data product." + ) return else: dpid = dpid[0] @@ -1170,39 +1316,50 @@ def stack_by_table(filepath, package_listlist.append(re.findall(re.compile("basic|expanded"), files[f])) package = [x for package_list in package_listlist for x in package_list] package = list(set(package)) - if 'expanded' in package: - package = 'expanded' + if "expanded" in package: + package = "expanded" else: - package = 'basic' + package = "basic" # Error message for AOP data - if dpid[4] == '3' and not dpid == 'DP1.30012.001': - logging.info("This is an AOP data product, files cannot be stacked. Use by_file_aop() or by_tile_aop() to download data.") + if dpid[4] == "3" and not dpid == "DP1.30012.001": + logging.info( + "This is an AOP data product, files cannot be stacked. Use by_file_aop() or by_tile_aop() to download data." + ) return # Error messafe for SAE data - if dpid == 'DP4.00200.001': - logging.info("This eddy covariance data product is in HDF5 format. Stack using the stackEddy() function in the R package version of neonUtilities.") + if dpid == "DP4.00200.001": + logging.info( + "This eddy covariance data product is in HDF5 format. Stack using the stackEddy() function in the R package version of neonUtilities." + ) return # Exceptions for digital hemispheric photos - if dpid == 'DP1.10017.001' and package == 'expanded': + if dpid == "DP1.10017.001" and package == "expanded": save_unzipped_files = True - logging.info("Note: Digital hemispheric photos (in NEF format) cannot be stacked; only the CSV metadata files will be stacked.") + logging.info( + "Note: Digital hemispheric photos (in NEF format) cannot be stacked; only the CSV metadata files will be stacked." + ) # Warning about all sensor soil data # Test and modify the file length for the alert, this should be a lot better with arrow - if dpid in ['DP1.00094.001','DP1.00041.001'] and len(files) > 24: - logging.info("Warning! Attempting to stack soil sensor data. Note that due to the number of soil sensors at each site, data volume is very high for these data. 
Consider dividing data processing into chunks and/or using a high-performance system.") + if dpid in ["DP1.00094.001", "DP1.00041.001"] and len(files) > 24: + logging.info( + "Warning! Attempting to stack soil sensor data. Note that due to the number of soil sensors at each site, data volume is very high for these data. Consider dividing data processing into chunks and/or using a high-performance system." + ) # If all checks pass, unzip and stack files if cloud_mode: - stackedlist = stack_data_files_parallel(folder=filepath, package=package, - dpid=dpid, progress=progress, - cloud_mode=True) + stackedlist = stack_data_files_parallel( + folder=filepath, + package=package, + dpid=dpid, + progress=progress, + cloud_mode=True, + ) else: - # If the filepath is a zip file if not folder: unzip_zipfile(zippath=filepath) @@ -1215,20 +1372,19 @@ def stack_by_table(filepath, stackpath = filepath # Stack the files - stackedlist = stack_data_files_parallel(folder=stackpath, - package=package, - dpid=dpid, - progress=progress) + stackedlist = stack_data_files_parallel( + folder=stackpath, package=package, dpid=dpid, progress=progress + ) # delete input files if not save_unzipped_files: - ufl = glob.glob(stackpath+"/**.*/*", recursive=True) + ufl = glob.glob(stackpath + "/**.*/*", recursive=True) for fl in ufl: try: os.remove(fl) except Exception: pass - dirlist = glob.glob(stackpath+"/*", recursive=True) + dirlist = glob.glob(stackpath + "/*", recursive=True) for d in dirlist: try: os.rmdir(d) @@ -1265,77 +1421,90 @@ def stack_by_table(filepath, for k in stackedlistsort.keys(): tk = stackedlistsort[k] if "citation" in k: - with open(f"{stacked_files_dir}/{k}.txt", - mode="w+", encoding="utf-8") as f: + with open( + f"{stacked_files_dir}/{k}.txt", mode="w+", encoding="utf-8" + ) as f: f.write(tk) else: if "readme" in k: - tk.to_csv(f"{stacked_files_dir}/{k}.txt", - sep="\t", index=False) + tk.to_csv(f"{stacked_files_dir}/{k}.txt", sep="\t", index=False) else: - # Write numeric data to up to 15 digits, rounding to - # the precision just below the maximum. This mimics the - # default behavior in R, to ensure the two languages' - # versions of neonUtilities match. For an interesting - # discussion of the issues around this, see + # Write numeric data to up to 15 digits, rounding to + # the precision just below the maximum. This mimics the + # default behavior in R, to ensure the two languages' + # versions of neonUtilities match. For an interesting + # discussion of the issues around this, see # https://github.com/pandas-dev/pandas/issues/16452 - tk.to_csv(f"{stacked_files_dir}/{k}.csv", - index=False, float_format="%.15g") + tk.to_csv( + f"{stacked_files_dir}/{k}.csv", + index=False, + float_format="%.15g", + ) return None -def load_by_product(dpid, site="all", startdate=None, enddate=None, - package="basic", release="current", - timeindex="all", tabl="all", check_size=True, - include_provisional=False, cloud_mode=False, - progress=True, token=None): +def load_by_product( + dpid, + site="all", + startdate=None, + enddate=None, + package="basic", + release="current", + timeindex="all", + tabl="all", + check_size=True, + include_provisional=False, + cloud_mode=False, + progress=True, + token=None, +): """ - This function downloads product-site-month data package files from NEON, unzips + This function downloads product-site-month data package files from NEON, unzips and stacks the data files, and loads to the environment. 
Parameters ---------------- dpid: str Data product identifier in the form DP#.#####.### - + site: str Either the string 'all', or one or more 4-letter NEON site codes. Defaults to 'all'. - + startdate: str, optional Earliest date of data to download, in the form YYYY-MM - + enddate: str, optional Latest date of data to download, in the form YYYY-MM - + package: str, optional Download package to access, either basic or expanded. Defaults to 'basic'. - + release: str, optional Data release to download. Defaults to the most recent release. - + timeindex: str, optional Either 'all', or the time index of data to download, in minutes. Only applicable to sensor (IS) data. Defaults to 'all'. - + tabl: str, optional Either 'all', or the name of a single data table to download. Only applicable to observational (OS) data. Defaults to 'all'. - + check_size: bool, optional True or False, should the user approve the total file size before downloading? Defaults to True. When working in batch mode, or other non-interactive workflow, use check_size=False. - + include_provisional: bool, optional - Should Provisional data be returned in the download? Defaults to False. See - https://www.neonscience.org/data-samples/data-management/data-revisions-releases + Should Provisional data be returned in the download? Defaults to False. See + https://www.neonscience.org/data-samples/data-management/data-revisions-releases for details on the difference between provisional and released data. - + cloud_mode: bool, optional Use cloud mode to transfer files cloud-to-cloud? Should only be used if the destination location is in the cloud. Defaults to False. - + progress: bool, optional Should the function display progress bars as it runs? Defaults to True. - - token: User-specific API token from data.neonscience.org user account. See - https://data.neonscience.org/data-api/rate-limiting/ for details about + + token: User-specific API token from data.neonscience.org user account. See + https://data.neonscience.org/data-api/rate-limiting/ for details about API rate limits and user tokens. If omitted, download uses the public rate limit. Return @@ -1353,11 +1522,11 @@ def load_by_product(dpid, site="all", startdate=None, enddate=None, Notes -------- Windows Path Length Limitations: - When using this function, you may encounter path length limitations on Windows systems. + When using this function, you may encounter path length limitations on Windows systems. Windows has a default maximum path length of 260 characters, which can cause download and unzip - functions to fail if this limit is exceeded. If the file path exceeds 260 characters on a Windows system, - the package will issue a warning.You can choose to ignore or filter these warnings using Python's warnings - module if you prefer not to see them. If the function is unable to unzip a folder due to path length + functions to fail if this limit is exceeded. If the file path exceeds 260 characters on a Windows system, + the package will issue a warning.You can choose to ignore or filter these warnings using Python's warnings + module if you prefer not to see them. If the function is unable to unzip a folder due to path length limitations, an OSError will be raised. 
Created on June 12 2024 @@ -1368,34 +1537,55 @@ def load_by_product(dpid, site="all", startdate=None, enddate=None, savepath = os.getcwd() if cloud_mode: - flist = zips_by_product(dpid=dpid, site=site, - startdate=startdate, enddate=enddate, - package=package, release=release, - timeindex=timeindex, tabl=tabl, - check_size=check_size, - include_provisional=include_provisional, - cloud_mode=True, - progress=progress, token=token, - savepath=savepath) - - outlist = stack_by_table(filepath=flist, savepath="envt", - cloud_mode=True, - save_unzipped_files=False, - progress=progress) + flist = zips_by_product( + dpid=dpid, + site=site, + startdate=startdate, + enddate=enddate, + package=package, + release=release, + timeindex=timeindex, + tabl=tabl, + check_size=check_size, + include_provisional=include_provisional, + cloud_mode=True, + progress=progress, + token=token, + savepath=savepath, + ) + + outlist = stack_by_table( + filepath=flist, + savepath="envt", + cloud_mode=True, + save_unzipped_files=False, + progress=progress, + ) else: - zips_by_product(dpid=dpid, site=site, - startdate=startdate, enddate=enddate, - package=package, release=release, - timeindex=timeindex, tabl=tabl, - check_size=check_size, - include_provisional=include_provisional, - progress=progress, token=token, - savepath=savepath) + zips_by_product( + dpid=dpid, + site=site, + startdate=startdate, + enddate=enddate, + package=package, + release=release, + timeindex=timeindex, + tabl=tabl, + check_size=check_size, + include_provisional=include_provisional, + progress=progress, + token=token, + savepath=savepath, + ) stackpath = savepath + "/filesToStack" + dpid[4:9] + "/" - outlist = stack_by_table(filepath=stackpath, savepath="envt", - save_unzipped_files=False, progress=progress) + outlist = stack_by_table( + filepath=stackpath, + savepath="envt", + save_unzipped_files=False, + progress=progress, + ) return outlist diff --git a/tests/test_aop_download.py b/tests/test_aop_download.py index f4791cf..323e456 100644 --- a/tests/test_aop_download.py +++ b/tests/test_aop_download.py @@ -32,7 +32,10 @@ from parameterized import parameterized # import neon utilities functions that are being tested -from src.neonutilities.aop_download import by_file_aop, by_tile_aop, list_available_dates +from src.neonutilities.aop_download import ( + by_file_aop, + by_tile_aop, +) # read in token from os.environ (requires the token to be set) token = os.environ.get("NEON_TOKEN") @@ -55,7 +58,10 @@ def test_invalid_dpid_format(self): Test that invalid dpid format raises ValueError and message displays correctly. """ invalid_dpid = "DP1.30001" - with self.assertRaises(ValueError, msg=f'{invalid_dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#'): + with self.assertRaises( + ValueError, + msg=f"{invalid_dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#", + ): by_file_aop(dpid=invalid_dpid, site=self.site, year=self.year) def test_invalid_aop_dpid_pattern(self): @@ -63,7 +69,10 @@ def test_invalid_aop_dpid_pattern(self): Test that invalid AOP dpid pattern raises a ValueError and message displays correctly. """ invalid_aop_dpid = "DP1.20001.001" - with self.assertRaises(ValueError, msg=f'{invalid_aop_dpid} is not a valid AOP data product ID. AOP products follow the format DP#.300##.00#'): + with self.assertRaises( + ValueError, + msg=f"{invalid_aop_dpid} is not a valid AOP data product ID. 
AOP products follow the format DP#.300##.00#", + ): by_file_aop(dpid=invalid_aop_dpid, site=self.site, year=self.year) def test_invalid_aop_dpid_suspended(self): @@ -72,25 +81,32 @@ def test_invalid_aop_dpid_suspended(self): """ suspended_aop_dpid = "DP2.30016.001" # ' Valid AOP DPIDs are '): - with self.assertRaises(ValueError, msg=f'{suspended_aop_dpid} has been suspended and is not currently available, see https://www.neonscience.org/data-products/{suspended_aop_dpid} for more details.'): - by_file_aop(dpid=suspended_aop_dpid, - site=self.site, year=self.year) + with self.assertRaises( + ValueError, + msg=f"{suspended_aop_dpid} has been suspended and is not currently available, see https://www.neonscience.org/data-products/{suspended_aop_dpid} for more details.", + ): + by_file_aop(dpid=suspended_aop_dpid, site=self.site, year=self.year) def test_check_field_spectra_dpid(self): """ Test that providing field spectra dpid raises ValueError and message displays correctly. """ - field_spectra_dpid = 'DP1.30012.001' - with self.assertRaises(ValueError, msg=f'{field_spectra_dpid} is the Field spectral data product, which is published as tabular data. Use zipsByProduct() or loadByProduct() to download these data.'): - by_file_aop(dpid=field_spectra_dpid, - site=self.site, year=self.year) + field_spectra_dpid = "DP1.30012.001" + with self.assertRaises( + ValueError, + msg=f"{field_spectra_dpid} is the Field spectral data product, which is published as tabular data. Use zipsByProduct() or loadByProduct() to download these data.", + ): + by_file_aop(dpid=field_spectra_dpid, site=self.site, year=self.year) def test_invalid_site_format(self): """ Test that invalid site format raises ValueError and message displays correctly. """ - invalid_site = 'McRae' - with self.assertRaises(ValueError, msg=f'{invalid_site} is an invalid site format. A four-letter NEON site code is required. NEON sites codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites'): + invalid_site = "McRae" + with self.assertRaises( + ValueError, + msg=f"{invalid_site} is an invalid site format. A four-letter NEON site code is required. NEON sites codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites", + ): by_file_aop(dpid=self.dpid, site=invalid_site, year=self.year) def test_invalid_neon_site(self): @@ -98,62 +114,85 @@ def test_invalid_neon_site(self): Test that an invalid NEON site code raises a ValueError and the message displays correctly. """ invalid_site = "HOPD" - with self.assertRaises(ValueError, msg=f'{invalid_site} is not a valid NEON site code. A complete list of NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites'): + with self.assertRaises( + ValueError, + msg=f"{invalid_site} is not a valid NEON site code. A complete list of NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites", + ): by_file_aop(dpid=self.dpid, site=invalid_site, year=self.year) - @parameterized.expand([("21",), ("2021-05",), ]) + @parameterized.expand( + [ + ("21",), + ("2021-05",), + ] + ) def test_invalid_year_format(self, year): """ Test that invalid year format raises ValueError and message displays correctly. """ - with pytest.raises(ValueError, match=f'{year} is an invalid year. Year is required in the format "2017" or 2017, eg. AOP data are available from 2013 to present.'): + with pytest.raises( + ValueError, + match=f'{year} is an invalid year. Year is required in the format "2017" or 2017, eg. 
AOP data are available from 2013 to present.', + ): by_file_aop(dpid=self.dpid, site=self.site, year=year) - @parameterized.expand([ - ("2022", "CHEQ", "STEI"), - ("2022", "TREE", "STEI"), - ("2021", "DCFS", "WOOD"), - ("2020", "KONA", "KONZ"), - ], name_func=lambda f, n, p: f'{f.__name__}_{p.args[1]}_{p.args[0]}') + @parameterized.expand( + [ + ("2022", "CHEQ", "STEI"), + ("2022", "TREE", "STEI"), + ("2021", "DCFS", "WOOD"), + ("2020", "KONA", "KONZ"), + ], + name_func=lambda f, n, p: f"{f.__name__}_{p.args[1]}_{p.args[0]}", + ) # the name_func displays a more descriptive test name when running the tests - @patch('builtins.input', return_value='n') - def test_collocated_terrestrial_site_message(self, year, site, flightSite, input_mock): + @patch("builtins.input", return_value="n") + def test_collocated_terrestrial_site_message( + self, year, site, flightSite, input_mock + ): """ Test application of the terrestrial collocated site lookup, and expected message display. """ - with self.assertLogs(level='INFO') as cm: - by_file_aop(dpid="DP3.30015.001", site=site, - year=year, token=token) + with self.assertLogs(level="INFO") as cm: + by_file_aop(dpid="DP3.30015.001", site=site, year=year, token=token) self.assertIn( - f'INFO:root:{site} is part of the flight box for {flightSite}. Downloading data from {flightSite}.', cm.output) - - @parameterized.expand([ - ("2018", "BARC", "OSBS"), - ("2020", "COMO", "NIWO"), - # ("2021", "BLDE", "YELL"), - # ("2020", "KING", "KONZ"), - ], name_func=lambda f, n, p: f'{f.__name__}_{p.args[1]}_{p.args[0]}') - @patch('builtins.input', return_value='n') + f"INFO:root:{site} is part of the flight box for {flightSite}. Downloading data from {flightSite}.", + cm.output, + ) + + @parameterized.expand( + [ + ("2018", "BARC", "OSBS"), + ("2020", "COMO", "NIWO"), + # ("2021", "BLDE", "YELL"), + # ("2020", "KING", "KONZ"), + ], + name_func=lambda f, n, p: f"{f.__name__}_{p.args[1]}_{p.args[0]}", + ) + @patch("builtins.input", return_value="n") def test_collocated_aquatic_site_message(self, year, site, flightSite, input_mock): """ Test application of the aquatic collocated site lookup, and expected message display. """ - with self.assertLogs(level='INFO') as cm: - by_file_aop(dpid=self.dpid, site=site, - year=year, token=token) + with self.assertLogs(level="INFO") as cm: + by_file_aop(dpid=self.dpid, site=site, year=year, token=token) self.assertIn( - f'INFO:root:{site} is an aquatic site and is sometimes included in the flight box for {flightSite}. Aquatic sites are not always included in the flight coverage every year.\nDownloading data from {flightSite}. Check data to confirm coverage of {site}.', cm.output) + f"INFO:root:{site} is an aquatic site and is sometimes included in the flight box for {flightSite}. Aquatic sites are not always included in the flight coverage every year.\nDownloading data from {flightSite}. Check data to confirm coverage of {site}.", + cm.output, + ) def test_no_data_available_message(self): """ Test that the by_file_aop() function returns the expected error log when no data is available for a selected site and year. 
""" - with self.assertLogs(level='INFO') as cm: + with self.assertLogs(level="INFO") as cm: by_file_aop(dpid="DP3.30015.001", site=self.site, year=2020) self.assertIn( - f'INFO:root:There are no {self.dpid} data available at the site {self.site} in 2020.\nTo display available dates for a given data product and site, use the function list_available_dates().', cm.output) + f"INFO:root:There are no {self.dpid} data available at the site {self.site} in 2020.\nTo display available dates for a given data product and site, use the function list_available_dates().", + cm.output, + ) - @patch('builtins.input', return_value='n') + @patch("builtins.input", return_value="n") def test_check_download_size_message(self, input_mock): """ Test that download check_size message displays correctly. @@ -161,7 +200,8 @@ def test_check_download_size_message(self, input_mock): result = by_file_aop(dpid=self.dpid, site=self.site, year=self.year) # Check that the function asked for confirmation to download and prints expected message. input_mock.assert_called_once_with( - 'Continuing will download 128 files totaling approximately 93.1 MB. Do you want to proceed? (y/n) ') + "Continuing will download 128 files totaling approximately 93.1 MB. Do you want to proceed? (y/n) " + ) # Check that the function halted the download self.assertEqual(result, None) @@ -173,23 +213,28 @@ def test_all_provisional_no_data_available_message(self): Test that the by_file_aop() function returns the expected message when include_provisional is set to False (default) but no data are available. This has already run through the check that any data is available(eg. there is data at that site for the year provided) """ - with self.assertLogs(level='INFO') as cm: + with self.assertLogs(level="INFO") as cm: by_file_aop(dpid="DP3.30015.001", site="WLOU", year=2024) self.assertIn( - 'INFO:root:No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True.', cm.output) + "INFO:root:No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True.", + cm.output, + ) # provisional included, and no data available - @patch('builtins.input', return_value='n') + @patch("builtins.input", return_value="n") def test_provisional_included_and_data_available_message(self, input_mock): """ Test that the by_file_aop() function returns the expected message when include_provisional is set to False (default) but no data are available. This has already run through the check that any data is available(eg. there is data at that site for the year provided) """ - with self.assertLogs(level='INFO') as cm: - by_file_aop(dpid="DP3.30015.001", site="WLOU", - year=2024, include_provisional=True) + with self.assertLogs(level="INFO") as cm: + by_file_aop( + dpid="DP3.30015.001", site="WLOU", year=2024, include_provisional=True + ) self.assertIn( - 'INFO:root:Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.', cm.output) + "INFO:root:Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.", + cm.output, + ) # other scenarios- check messages but don't download the data ? # provisional not included, and data available @@ -208,112 +253,195 @@ def setUp(self): def test_invalid_dpid_format(self): invalid_dpid = "DP1.30001" - with self.assertRaises(ValueError, msg=f'{invalid_dpid} is not a properly formatted data product ID. 
The correct format is DP#.#####.00#'): - by_tile_aop(dpid=invalid_dpid, site=self.site, year=self.year, - easting=self.easting, northing=self.northing) + with self.assertRaises( + ValueError, + msg=f"{invalid_dpid} is not a properly formatted data product ID. The correct format is DP#.#####.00#", + ): + by_tile_aop( + dpid=invalid_dpid, + site=self.site, + year=self.year, + easting=self.easting, + northing=self.northing, + ) def test_invalid_aop_l3_dpid(self): """ Test that invalid AOP dpid raises a ValueError and message displays correctly. """ invalid_aop_dpid = "DP1.30001.001" - with self.assertRaises(ValueError, msg=f'{invalid_aop_dpid} is not a valid Level 3 AOP data product ID. Level 3 AOP products follow the format DP3.300##.00#'): - by_tile_aop(dpid=invalid_aop_dpid, site=self.site, year=self.year, - easting=self.easting, northing=self.northing) + with self.assertRaises( + ValueError, + msg=f"{invalid_aop_dpid} is not a valid Level 3 AOP data product ID. Level 3 AOP products follow the format DP3.300##.00#", + ): + by_tile_aop( + dpid=invalid_aop_dpid, + site=self.site, + year=self.year, + easting=self.easting, + northing=self.northing, + ) def test_check_field_spectra_dpid(self): - field_spectra_dpid = 'DP1.30012.001' - with self.assertRaises(ValueError, msg=f'{field_spectra_dpid} is the Field spectral data product, which is published as tabular data. Use zipsByProduct() or loadByProduct() to download these data.'): - by_tile_aop(dpid=field_spectra_dpid, site=self.site, year=self.year, - easting=self.easting, northing=self.northing) + field_spectra_dpid = "DP1.30012.001" + with self.assertRaises( + ValueError, + msg=f"{field_spectra_dpid} is the Field spectral data product, which is published as tabular data. Use zipsByProduct() or loadByProduct() to download these data.", + ): + by_tile_aop( + dpid=field_spectra_dpid, + site=self.site, + year=self.year, + easting=self.easting, + northing=self.northing, + ) def test_invalid_site_format(self): - invalid_site = 'McRae' - with self.assertRaises(ValueError, msg=f'{invalid_site} is an invalid site format. A four-letter NEON site code is required. NEON sites codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites'): - by_tile_aop(dpid=self.dpid, site=invalid_site, year=self.year, - easting=self.easting, northing=self.northing) + invalid_site = "McRae" + with self.assertRaises( + ValueError, + msg=f"{invalid_site} is an invalid site format. A four-letter NEON site code is required. NEON sites codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites", + ): + by_tile_aop( + dpid=self.dpid, + site=invalid_site, + year=self.year, + easting=self.easting, + northing=self.northing, + ) def test_invalid_neon_site(self): - invalid_site = 'ABBA' - with self.assertRaises(ValueError, msg=f'{invalid_site} is not a valid NEON site code. A complete list of NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites'): - by_tile_aop(dpid=self.dpid, site=invalid_site, year=self.year, - easting=self.easting, northing=self.northing) - - @parameterized.expand([("21",), ("2021-05",), ]) + invalid_site = "ABBA" + with self.assertRaises( + ValueError, + msg=f"{invalid_site} is not a valid NEON site code. 
A complete list of NEON site codes can be found here: https://www.neonscience.org/field-sites/explore-field-sites", + ): + by_tile_aop( + dpid=self.dpid, + site=invalid_site, + year=self.year, + easting=self.easting, + northing=self.northing, + ) + + @parameterized.expand( + [ + ("21",), + ("2021-05",), + ] + ) def test_invalid_year_format(self, year): """ Test that invalid year format raises ValueError and message displays correctly. """ - with pytest.raises(ValueError, match=f'{year} is an invalid year. Year is required in the format "2017" or 2017, eg. AOP data are available from 2013 to present.'): - by_tile_aop(dpid=self.dpid, site=self.site, year=year, - easting=self.easting, northing=self.northing) - - @parameterized.expand([ - ("2022", "CHEQ", "STEI"), - ("2022", "TREE", "STEI"), - ("2021", "DCFS", "WOOD"), - ("2020", "KONA", "KONZ"), - ], name_func=lambda f, n, p: f'{f.__name__}_{p.args[1]}_{p.args[0]}') + with pytest.raises( + ValueError, + match=f'{year} is an invalid year. Year is required in the format "2017" or 2017, eg. AOP data are available from 2013 to present.', + ): + by_tile_aop( + dpid=self.dpid, + site=self.site, + year=year, + easting=self.easting, + northing=self.northing, + ) + + @parameterized.expand( + [ + ("2022", "CHEQ", "STEI"), + ("2022", "TREE", "STEI"), + ("2021", "DCFS", "WOOD"), + ("2020", "KONA", "KONZ"), + ], + name_func=lambda f, n, p: f"{f.__name__}_{p.args[1]}_{p.args[0]}", + ) # the name_func displays a more descriptive test name when running the tests - @patch('builtins.input', return_value='n') - def test_collocated_terrestrial_site_message(self, year, site, flightSite, input_mock): + @patch("builtins.input", return_value="n") + def test_collocated_terrestrial_site_message( + self, year, site, flightSite, input_mock + ): """ Test application of the terrestrial collocated site lookup, and expected message display. """ - with self.assertLogs(level='INFO') as cm: - by_tile_aop(dpid=self.dpid, site=site, - year=year, easting=[], northing=[]) + with self.assertLogs(level="INFO") as cm: + by_tile_aop(dpid=self.dpid, site=site, year=year, easting=[], northing=[]) self.assertIn( - f'INFO:root:{site} is part of the flight box for {flightSite}. Downloading data from {flightSite}.', cm.output) - - @parameterized.expand([ - ("2018", "BARC", "OSBS"), - ("2020", "COMO", "NIWO"), - # ("2021", "BLDE", "YELL"), - # ("2020", "KING", "KONZ"), - ], name_func=lambda f, n, p: f'{f.__name__}_{p.args[1]}_{p.args[0]}') - @patch('builtins.input', return_value='n') + f"INFO:root:{site} is part of the flight box for {flightSite}. Downloading data from {flightSite}.", + cm.output, + ) + + @parameterized.expand( + [ + ("2018", "BARC", "OSBS"), + ("2020", "COMO", "NIWO"), + # ("2021", "BLDE", "YELL"), + # ("2020", "KING", "KONZ"), + ], + name_func=lambda f, n, p: f"{f.__name__}_{p.args[1]}_{p.args[0]}", + ) + @patch("builtins.input", return_value="n") def test_collocated_aquatic_site_message(self, year, site, flightSite, input_mock): """ Test application of the aquatic collocated site lookup, and expected message display. """ - with self.assertLogs(level='INFO') as cm: - by_tile_aop(dpid=self.dpid, site=site, - year=year, easting=[], northing=[]) + with self.assertLogs(level="INFO") as cm: + by_tile_aop(dpid=self.dpid, site=site, year=year, easting=[], northing=[]) self.assertIn( - f'INFO:root:{site} is an aquatic site and is sometimes included in the flight box for {flightSite}. 
Aquatic sites are not always included in the flight coverage every year.\nDownloading data from {flightSite}. Check data to confirm coverage of {site}.', cm.output) + f"INFO:root:{site} is an aquatic site and is sometimes included in the flight box for {flightSite}. Aquatic sites are not always included in the flight coverage every year.\nDownloading data from {flightSite}. Check data to confirm coverage of {site}.", + cm.output, + ) def test_no_data_available_message(self): """ Test that the by_tile_aop() function returns the expected error log when no data is available for a selected site and year. """ - with self.assertLogs(level='INFO') as cm: - by_tile_aop(dpid="DP3.30015.001", site=self.site, - year=2020, easting=self.easting, northing=self.northing) + with self.assertLogs(level="INFO") as cm: + by_tile_aop( + dpid="DP3.30015.001", + site=self.site, + year=2020, + easting=self.easting, + northing=self.northing, + ) self.assertIn( - f'INFO:root:There are no DP3.30015.001 data available at the site {self.site} in 2020.\nTo display available dates for a given data product and site, use the function list_available_dates().', cm.output) + f"INFO:root:There are no DP3.30015.001 data available at the site {self.site} in 2020.\nTo display available dates for a given data product and site, use the function list_available_dates().", + cm.output, + ) # 'INFO:root:There are no data available at the selected site and year.', cm.output) def test_no_data_files_found_message(self): """ Test that the by_tile_aop() function returns the expected error log when no data files are found. """ - with self.assertLogs(level='INFO') as cm: - by_tile_aop(dpid="DP3.30015.001", site=self.site, - year=2020, easting=564000, northing=4900000) + with self.assertLogs(level="INFO") as cm: + by_tile_aop( + dpid="DP3.30015.001", + site=self.site, + year=2020, + easting=564000, + northing=4900000, + ) self.assertIn( - f'INFO:root:There are no DP3.30015.001 data available at the site {self.site} in 2020.\nTo display available dates for a given data product and site, use the function list_available_dates().', cm.output) + f"INFO:root:There are no DP3.30015.001 data available at the site {self.site} in 2020.\nTo display available dates for a given data product and site, use the function list_available_dates().", + cm.output, + ) - @ patch('builtins.input', return_value='n') + @patch("builtins.input", return_value="n") def test_check_download_size_message(self, input_mock): """ Test that download check_size message displays correctly. """ - result = by_tile_aop(dpid=self.dpid, site=self.site, - year=self.year, easting=self.easting, northing=self.northing) + result = by_tile_aop( + dpid=self.dpid, + site=self.site, + year=self.year, + easting=self.easting, + northing=self.northing, + ) # Check that the function asked for confirmation to download and prints expected message. input_mock.assert_called_once_with( - 'Continuing will download 7 files totaling approximately 3.9 MB. Do you want to proceed? (y/n) ') + "Continuing will download 7 files totaling approximately 3.9 MB. Do you want to proceed? (y/n) " + ) # Check that the function halted the download self.assertEqual(result, None) @@ -326,26 +454,41 @@ def test_all_provisional_no_data_available_message(self): Test that the by_tile_aop() function returns the expected message when include_provisional is set to False (default) but no data are available. This has already run through the check that any data is available(eg. 
there is data at that site for the year provided) """ - with self.assertLogs(level='INFO') as cm: - by_tile_aop(dpid=self.dpid, site='WLOU', year=2024, - easting=self.easting, northing=self.northing) + with self.assertLogs(level="INFO") as cm: + by_tile_aop( + dpid=self.dpid, + site="WLOU", + year=2024, + easting=self.easting, + northing=self.northing, + ) self.assertIn( - 'INFO:root:No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True.', cm.output) + "INFO:root:No data files found. Available data may all be provisional. To download provisional data, use input parameter include_provisional=True.", + cm.output, + ) # provisional included, and no data available - @ patch('builtins.input', return_value='n') + @patch("builtins.input", return_value="n") def test_provisional_included_and_data_available_message(self, input_mock): """ Test that the by_file_aop() function returns the expected message when include_provisional is set to False (default) but no data are available. This has already run through the check that any data is available(eg. there is data at that site for the year provided) """ - with self.assertLogs(level='INFO') as cm: - by_tile_aop(dpid=self.dpid, site=self.site, year=2023, - include_provisional=True, easting=self.easting, northing=self.northing) + with self.assertLogs(level="INFO") as cm: + by_tile_aop( + dpid=self.dpid, + site=self.site, + year=2023, + include_provisional=True, + easting=self.easting, + northing=self.northing, + ) self.assertIn( - 'INFO:root:Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.', cm.output) + "INFO:root:Provisional data are included. To exclude provisional data, use input parameter include_provisional=False.", + cm.output, + ) -# pyproj is part of the requirements, so this is not needed - this was carried over from the R package + # pyproj is part of the requirements, so this is not needed - this was carried over from the R package # @ patch('builtins.input', return_value='n') # @ patch('importlib.import_module') # @ patch('logging.info') @@ -364,20 +507,30 @@ def test_provisional_included_and_data_available_message(self, input_mock): # "Package pyproj is required for this function to work at the BLAN site. Install and re-try." # ) - @ patch('builtins.input', return_value='n') + @patch("builtins.input", return_value="n") def test_blan_utm_info_message(self, input_mock): """ Test that the by_tile_aop() function returns the expected message about UTM zone conversion when BLAN is the site. """ - with self.assertLogs(level='INFO') as cm: - by_tile_aop(dpid="DP3.30015.001", site='BLAN', year=2022, - easting=243758.81, northing=4330667.37, verbose=True) - self.assertIn('INFO:root:Blandy (BLAN) plots include two UTM zones, flight data are all in 17N. ' - 'Coordinates in UTM zone 18N have been converted to 17N to download the correct tiles. ' - 'You will need to make the same conversion to connect airborne to ground data.', cm.output) - self.assertIn('INFO:root:UTM 17N Easting(s): 762717.81', cm.output) + with self.assertLogs(level="INFO") as cm: + by_tile_aop( + dpid="DP3.30015.001", + site="BLAN", + year=2022, + easting=243758.81, + northing=4330667.37, + verbose=True, + ) self.assertIn( - 'INFO:root:UTM 17N Northing(s): 4330881.38', cm.output) + "INFO:root:Blandy (BLAN) plots include two UTM zones, flight data are all in 17N. 
" + "Coordinates in UTM zone 18N have been converted to 17N to download the correct tiles. " + "You will need to make the same conversion to connect airborne to ground data.", + cm.output, + ) + self.assertIn("INFO:root:UTM 17N Easting(s): 762717.81", cm.output) + self.assertIn("INFO:root:UTM 17N Northing(s): 4330881.38", cm.output) self.assertIn( - 'INFO:root:UTM (x, y) lower-left coordinates of tiles to be downloaded:', cm.output) - self.assertIn('INFO:root:(762000, 4330000)', cm.output) + "INFO:root:UTM (x, y) lower-left coordinates of tiles to be downloaded:", + cm.output, + ) + self.assertIn("INFO:root:(762000, 4330000)", cm.output) diff --git a/tests/test_get_citation.py b/tests/test_get_citation.py index 660e472..992e90f 100644 --- a/tests/test_get_citation.py +++ b/tests/test_get_citation.py @@ -25,7 +25,7 @@ def test_get_citation_provisional(): Test that the get_citation() function returns the expected citation for provisional data """ cit = get_citation(dpid="DP1.10003.001", release="PROVISIONAL") - citexp = '@misc{DP1.10003.001/provisional,\n doi = {},\n url = {https://data.neonscience.org/data-products/DP1.10003.001},\n author = {{National Ecological Observatory Network (NEON)}},\n language = {en},\n title = {Breeding landbird point counts (DP1.10003.001)},\n publisher = {National Ecological Observatory Network (NEON)},\n year = {2025}\n}' + citexp = "@misc{DP1.10003.001/provisional,\n doi = {},\n url = {https://data.neonscience.org/data-products/DP1.10003.001},\n author = {{National Ecological Observatory Network (NEON)}},\n language = {en},\n title = {Breeding landbird point counts (DP1.10003.001)},\n publisher = {National Ecological Observatory Network (NEON)},\n year = {2025}\n}" assert cit == citexp @@ -34,5 +34,5 @@ def test_get_citation_release(): Test that the get_citation() function returns the expected citation for a Release """ cit = get_citation(dpid="DP1.10098.001", release="RELEASE-2023") - citexp = '@misc{https://doi.org/10.48443/73zn-k414,\n doi = {10.48443/73ZN-K414},\n url = {https://data.neonscience.org/data-products/DP1.10098.001/RELEASE-2023},\n author = {{National Ecological Observatory Network (NEON)}},\n keywords = {plant productivity, production, carbon cycle, biomass, vegetation, productivity, plants, trees, shrubs, lianas, saplings, net primary productivity (NPP), annual net primary productivity (ANPP), woody plants, vegetation structure, tree height, canopy height, vst},\n language = {en},\n title = {Vegetation structure (DP1.10098.001)},\n publisher = {National Ecological Observatory Network (NEON)},\n year = {2023}\n}\n' + citexp = "@misc{https://doi.org/10.48443/73zn-k414,\n doi = {10.48443/73ZN-K414},\n url = {https://data.neonscience.org/data-products/DP1.10098.001/RELEASE-2023},\n author = {{National Ecological Observatory Network (NEON)}},\n keywords = {plant productivity, production, carbon cycle, biomass, vegetation, productivity, plants, trees, shrubs, lianas, saplings, net primary productivity (NPP), annual net primary productivity (ANPP), woody plants, vegetation structure, tree height, canopy height, vst},\n language = {en},\n title = {Vegetation structure (DP1.10098.001)},\n publisher = {National Ecological Observatory Network (NEON)},\n year = {2023}\n}\n" assert cit == citexp diff --git a/tests/test_metadata_helpers.py b/tests/test_metadata_helpers.py index c674529..076601f 100644 --- a/tests/test_metadata_helpers.py +++ b/tests/test_metadata_helpers.py @@ -10,6 +10,7 @@ import unittest from 
src.neonutilities.helper_mods.metadata_helpers import convert_byte_size + class TestConvertByteSize(unittest.TestCase): def test_convert_byte_size(self): """ @@ -17,22 +18,23 @@ def test_convert_byte_size(self): """ # Test bytes self.assertEqual(convert_byte_size(500), "500.0 B") - + # Test kilobytes self.assertEqual(convert_byte_size(1024), "1.0 KB") self.assertEqual(convert_byte_size(1536), "1.5 KB") - + # Test megabytes self.assertEqual(convert_byte_size(1048576), "1.0 MB") self.assertEqual(convert_byte_size(1572864), "1.5 MB") - + # Test gigabytes self.assertEqual(convert_byte_size(1073741824), "1.0 GB") self.assertEqual(convert_byte_size(1610612736), "1.5 GB") - + # Test terabytes self.assertEqual(convert_byte_size(1099511627776), "1.0 TB") self.assertEqual(convert_byte_size(1649267441664), "1.5 TB") + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/tests/test_zips_by_product.py b/tests/test_zips_by_product.py index 7f3962d..3618a5d 100644 --- a/tests/test_zips_by_product.py +++ b/tests/test_zips_by_product.py @@ -21,9 +21,13 @@ def test_zips_by_product_dpid(): Test that the zips_by_product() function errors correctly for an invalid DPID """ with pytest.raises(ValueError) as exc_info: - zips_by_product(dpid='DP1.444.001', site='NIWO', - startdate='2012-01', enddate='2022-12') - assert str(exc_info.value) == "DP1.444.001 is not a properly formatted data product ID. The correct format is DP#.#####.00#" + zips_by_product( + dpid="DP1.444.001", site="NIWO", startdate="2012-01", enddate="2022-12" + ) + assert ( + str(exc_info.value) + == "DP1.444.001 is not a properly formatted data product ID. The correct format is DP#.#####.00#" + ) def test_zips_by_product_site(caplog): @@ -32,21 +36,41 @@ def test_zips_by_product_site(caplog): """ caplog.set_level(logging.INFO) - zips_by_product(dpid='DP1.10003.001', site=['OKSR', 'ARIK'], - startdate='2012-01', enddate='2022-12') + zips_by_product( + dpid="DP1.10003.001", + site=["OKSR", "ARIK"], + startdate="2012-01", + enddate="2022-12", + ) + + assert any( + "There are no data at the selected sites." in record.message + for record in caplog.records + ) - assert any("There are no data at the selected sites." 
in record.message for record in caplog.records) - def test_zips_by_product_cloud(): """ Test that running in cloud mode returns the correct list of files """ - murls = zips_by_product(dpid='DP1.10003.001', site=['NIWO', 'PUUM'], - startdate='2019-01', enddate='2019-12', - release='RELEASE-2024', - check_size=False, cloud_mode=True) - lst = ['https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.brd_perpoint.2019-07.basic.20231227T192510Z.csv', 'https://storage.googleapis.com/neon-publication/release/tag/RELEASE-2024/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.EML.20190703-20190713.20240127T000425Z.xml', 'https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.variables.20231227T192510Z.csv', 'https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP0.10003.001.validation.20231227T192510Z.csv', 'https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.brd_countdata.2019-07.basic.20231227T192510Z.csv', 'https://storage.googleapis.com/neon-publication/release/tag/RELEASE-2024/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.readme.20240127T000425Z.txt', 'https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP0.10003.001.categoricalCodes.20231227T192510Z.csv'] + murls = zips_by_product( + dpid="DP1.10003.001", + site=["NIWO", "PUUM"], + startdate="2019-01", + enddate="2019-12", + release="RELEASE-2024", + check_size=False, + cloud_mode=True, + ) + lst = [ + "https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.brd_perpoint.2019-07.basic.20231227T192510Z.csv", + "https://storage.googleapis.com/neon-publication/release/tag/RELEASE-2024/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.EML.20190703-20190713.20240127T000425Z.xml", + "https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.variables.20231227T192510Z.csv", + "https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP0.10003.001.validation.20231227T192510Z.csv", + "https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.brd_countdata.2019-07.basic.20231227T192510Z.csv", + "https://storage.googleapis.com/neon-publication/release/tag/RELEASE-2024/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP1.10003.001.readme.20240127T000425Z.txt", + "https://storage.googleapis.com/neon-publication/NEON.DOM.SITE.DP1.10003.001/NIWO/20190701T000000--20190801T000000/basic/NEON.D13.NIWO.DP0.10003.001.categoricalCodes.20231227T192510Z.csv", + ] assert murls[0] == lst @@ -54,12 +78,22 @@ def test_zips_by_product_avg(): """ Test that download by averaging interval returns the correct list of files """ - murls = zips_by_product(dpid='DP1.00005.001', site=['NIWO', 'PUUM'], - 
startdate='2022-06', enddate='2022-07', - timeindex=30, check_size=False, progress=False, - release='RELEASE-2024', cloud_mode=True) + murls = zips_by_product( + dpid="DP1.00005.001", + site=["NIWO", "PUUM"], + startdate="2022-06", + enddate="2022-07", + timeindex=30, + check_size=False, + progress=False, + release="RELEASE-2024", + cloud_mode=True, + ) assert len(murls) == 2 ml = murls[0] assert len(ml) == 29 - assert "https://storage.googleapis.com/neon-publication/release/tag/RELEASE-2024/NEON.DOM.SITE.DP1.00005.001/NIWO/20220701T000000--20220801T000000/basic/NEON.D13.NIWO.DP1.00005.001.readme.20240127T000425Z.txt" in ml - assert list(murls[1].values())[0] == 'RELEASE-2024' + assert ( + "https://storage.googleapis.com/neon-publication/release/tag/RELEASE-2024/NEON.DOM.SITE.DP1.00005.001/NIWO/20220701T000000--20220801T000000/basic/NEON.D13.NIWO.DP1.00005.001.readme.20240127T000425Z.txt" + in ml + ) + assert list(murls[1].values())[0] == "RELEASE-2024"
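
The assertions in tests/test_metadata_helpers.py above pin down the expected behaviour of the shared convert_byte_size() helper imported from helper_mods.metadata_helpers: byte counts are scaled by powers of 1024 and rendered with one decimal place plus a unit suffix (B, KB, MB, GB, TB). A minimal sketch consistent with those expected values — not necessarily the exact code now in metadata_helpers.py — looks like this:

def convert_byte_size(size_bytes):
    # Sketch only: divide by 1024 until the value drops below one unit,
    # then format with one decimal place, e.g. 1536 -> "1.5 KB".
    for unit in ["B", "KB", "MB", "GB"]:
        if size_bytes < 1024:
            return f"{size_bytes:.1f} {unit}"
        size_bytes /= 1024
    return f"{size_bytes:.1f} TB"

# Example values taken from the unit tests above:
# convert_byte_size(500)           -> "500.0 B"
# convert_byte_size(1536)          -> "1.5 KB"
# convert_byte_size(1073741824)    -> "1.0 GB"
# convert_byte_size(1649267441664) -> "1.5 TB"

Looping over the unit names rather than taking a logarithm keeps exact multiples of 1024 (the boundary cases the tests exercise) free of floating-point rounding surprises.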